This section covers importing the data, creating new variables, and establishing the data frames for the initial analysis.
# Import the DengAI data sets (features, labels, submission template).
# NOTE(review): absolute Windows paths — consider a relative path or here::here()
# for portability; kept configurable via data_dir below.
data_dir <- "D:/Google Drive/RYERSON/CKME 136/DengAI/DATASET"
dengue_features_test <- read.csv(file.path(data_dir, "dengue_features_test.csv"), header = TRUE, stringsAsFactors = FALSE)
dengue_features_train <- read.csv(file.path(data_dir, "dengue_features_train.csv"), header = TRUE, stringsAsFactors = FALSE)
dengue_labels_train <- read.csv(file.path(data_dir, "dengue_labels_train.csv"), header = TRUE, stringsAsFactors = FALSE)
submission_format <- read.csv(file.path(data_dir, "submission_format.csv"), header = TRUE, stringsAsFactors = FALSE)
# Parse the week start date and treat the city code ("sj"/"iq") as a factor.
dengue_features_test$week_start_date <- as.Date(dengue_features_test$week_start_date, "%Y-%m-%d")
dengue_features_train$week_start_date <- as.Date(dengue_features_train$week_start_date, "%Y-%m-%d")
dengue_features_test$city <- as.factor(dengue_features_test$city)
dengue_features_train$city <- as.factor(dengue_features_train$city)
# Convert the reanalysis temperatures from Kelvin to Celsius in both sets.
# reanalysis_tdtr_k is a diurnal temperature *range*, so the 273.15 offset
# does not apply and it is deliberately left unconverted (see summaries below).
kelvin_cols <- c("reanalysis_dew_point_temp_k",
                 "reanalysis_air_temp_k",
                 "reanalysis_max_air_temp_k",
                 "reanalysis_min_air_temp_k",
                 "reanalysis_avg_temp_k")
for (k_col in kelvin_cols) {
  dengue_features_train[[k_col]] <- dengue_features_train[[k_col]] - 273.15
  dengue_features_test[[k_col]] <- dengue_features_test[[k_col]] - 273.15
}
rm(data_dir, kelvin_cols, k_col)
# Spot-check the converted temperature columns: after subtracting 273.15 the
# summaries below should fall in a plausible Celsius range (they do).
summary(dengue_features_train$reanalysis_dew_point_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.49 20.97 22.49 22.10 23.31 25.30 10
summary(dengue_features_train$reanalysis_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.49 24.51 25.50 25.55 26.68 29.05 10
summary(dengue_features_train$reanalysis_max_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 24.65 27.85 29.25 30.28 32.35 40.85 10
summary(dengue_features_train$reanalysis_min_air_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.75 20.75 23.05 22.57 24.75 26.75 10
summary(dengue_features_train$reanalysis_avg_temp_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.74 25.11 26.14 26.08 27.06 29.78 10
# tdtr is a diurnal temperature range (hence the much smaller values); it was
# deliberately not converted above.
summary(dengue_features_train$reanalysis_tdtr_k)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.357 2.329 2.857 4.904 7.625 16.029 10
# Combined feature frame (train + test) for whole-period plots.
df <- rbind(dengue_features_train,dengue_features_test)
# Per-city splits of the features and labels.
iq_features_test <- dengue_features_test[dengue_features_test$city == 'iq', ]
sj_features_test <- dengue_features_test[dengue_features_test$city == 'sj', ]
iq_features_train <- dengue_features_train[dengue_features_train$city == 'iq', ]
sj_features_train <- dengue_features_train[dengue_features_train$city == 'sj', ]
iq_labels_train <- dengue_labels_train[dengue_labels_train$city == 'iq', ]
sj_labels_train <- dengue_labels_train[dengue_labels_train$city == 'sj', ]
# Per-city train+test feature frames (no labels exist for the test period).
sj <- rbind(sj_features_train,sj_features_test)
iq <- rbind(iq_features_train,iq_features_test)
# Attach total_cases to the training features via the city/year/week key.
df_train_labels <- merge(dengue_features_train, dengue_labels_train, by=c("city","year","weekofyear"))
sj_train_labels <- merge(sj_features_train, sj_labels_train, by=c("city","year","weekofyear"))
iq_train_labels <- merge(iq_features_train, iq_labels_train, by=c("city","year","weekofyear"))
In this section, we summarize the values of the data frames (together and by city). We also create the following graphs.
library(psych)
# Descriptive statistics (psych::describe) for each frame; the IQR and the
# 25th/75th percentiles are requested on top of the defaults. The View() calls
# are left commented for interactive inspection only.
df_test.summary <- psych::describe(dengue_features_test, IQR=TRUE, quant=c(.25,.75) )
#View(df_test.summary)
df_train.summary <- psych::describe(dengue_features_train, IQR=TRUE, quant=c(.25,.75) )
#View(df_train.summary)
sj_train.summary <- psych::describe(sj_train_labels, IQR=TRUE, quant=c(.25,.75) )
#View(sj_train.summary)
iq_train.summary <- psych::describe(iq_train_labels, IQR=TRUE, quant=c(.25,.75) )
#View(iq_train.summary)
df.summary <- psych::describe(df, IQR=TRUE, quant=c(.25,.75))
#View(df.summary)
sj.summary <- psych::describe(sj, IQR=TRUE, quant=c(.25,.75) )
#View(sj.summary)
iq.summary <- psych::describe(iq, IQR=TRUE, quant=c(.25,.75) )
#View(iq.summary)
# summary(dengue_features_test$week_start_date)
# summary(dengue_features_train$week_start_date)
# summary(iq_features_train$week_start_date)
# summary(iq_features_test$week_start_date)
# summary(sj_features_train$week_start_date)
# summary(sj_features_test$week_start_date)
# The summaries are only inspected interactively, so drop them afterwards.
rm(df_test.summary, df_train.summary, sj_train.summary, iq_train.summary, df.summary, sj.summary, iq.summary)
These graphs only include data from the training set as it includes total cases. Climate data across both training and test sets are below.
# Drop week_start_date (a Date column) so every remaining column can be
# histogrammed numerically.
df_train_labels$week_start_date <- NULL
col_names <- colnames(df_train_labels)
par(mfrow = c(2, 2))
# Columns 1-3 are the city/year/weekofyear identifiers; histogram the rest.
for (col_idx in seq(4, ncol(df_train_labels))) {
  hist(df_train_labels[, col_idx],
       breaks = 20,
       xlab = col_names[col_idx],
       main = paste0("Freq Histogram: ", col_names[col_idx]))
}
rm(col_names, col_idx)
Same as above but only for SJ.
# Same frequency histograms as above, restricted to San Juan rows only.
col_names <- colnames(df_train_labels)
par(mfrow = c(2, 2))
sj_rows <- df_train_labels$city == "sj"
for (col_idx in seq(4, ncol(df_train_labels))) {
  hist(df_train_labels[sj_rows, col_idx],
       breaks = 20,
       xlab = col_names[col_idx],
       main = paste0("Freq Histogram for SJ: ", col_names[col_idx]))
}
rm(col_names, col_idx, sj_rows)
Same as above but only for IQ.
# Same frequency histograms as above, restricted to Iquitos rows only.
col_names <- colnames(df_train_labels)
par(mfrow = c(2, 2))
iq_rows <- df_train_labels$city == "iq"
for (col_idx in seq(4, ncol(df_train_labels))) {
  hist(df_train_labels[iq_rows, col_idx],
       breaks = 20,
       xlab = col_names[col_idx],
       main = paste0("Freq Histogram for IQ: ", col_names[col_idx]))
}
rm(col_names, col_idx, iq_rows)
Includes all the data from the test and training sets by time for SJ; therefore total_cases is not included. Total_cases by time is plotted separately.
# Time-series line plot of each SJ climate column (train + test combined).
col_names <- colnames(sj)
par(mfrow = c(2, 2))
for (col_idx in seq(5, ncol(sj))) {
  series <- sj[, col_idx]
  # Draw an empty frame first so the y-range can ignore NAs, then add the line.
  plot(sj$week_start_date, series,
       type = "n",
       ylim = range(series, na.rm = TRUE),
       ylab = col_names[col_idx],
       main = paste0("Time Analysis for SJ: ", col_names[col_idx]))
  lines(sj$week_start_date, series)
}
rm(col_names, col_idx, series)
Includes all the data from the test and training sets by time for IQ; therefore total_cases is not included. Total_cases by time is plotted separately.
# Time-series line plot of each IQ climate column (train + test combined).
col_names <- colnames(iq)
par(mfrow = c(2, 2))
for (col_idx in seq(5, ncol(iq))) {
  series <- iq[, col_idx]
  # Draw an empty frame first so the y-range can ignore NAs, then add the line.
  plot(iq$week_start_date, series,
       type = "n",
       ylim = range(series, na.rm = TRUE),
       ylab = col_names[col_idx],
       main = paste0("Time Analysis for IQ: ", col_names[col_idx]))
  lines(iq$week_start_date, series)
}
rm(col_names, col_idx, series)
Weekly boxplots of all climate data from the test and training sets for SJ; total_cases is not included. Total_cases by week is plotted separately.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Weekly boxplots of each SJ climate column: grouping on weekofyear pools the
# same calendar week across all years into one box.
cnames <- colnames(sj)
# NOTE(review): par() only affects base graphics; it has no effect on the
# ggplot output below.
par(mfrow=c(2,2))
for (i in 5:(ncol(sj))) {
# NOTE(review): aes(y = sj[,i]) bypasses ggplot's data masking; it works here
# because sj is also the plot data, but it is fragile under faceting/subsetting.
gg1 <- ggplot(sj,
aes(x=weekofyear,
y = sj[,i],
group = weekofyear)) +
geom_boxplot() +
scale_x_continuous(breaks=seq(1,52,1)) +
ylab(cnames[i]) +
ggtitle(paste(cnames[i]))
print(gg1)
}
## Warning: Removed 234 rows containing non-finite values (stat_boxplot).
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
# Drop the loop temporaries from the SJ weekly boxplot chunk.
rm(cnames, i, gg1)
Weekly boxplots of all climate data from the test and training sets for IQ; total_cases is not included. Total_cases by week is plotted separately.
library(ggplot2)
# Weekly boxplots of each IQ climate column (train + test combined).
# BUG FIX: this chunk was copied from the SJ version and still plotted the `sj`
# data frame while labelling the axes with `iq` column names — so the "IQ"
# plots actually showed San Juan data. It now plots `iq` throughout.
# NOTE(review): par() has no effect on ggplot output; the stale warnings below
# this chunk are from the original (pre-fix) run.
cnames <- colnames(iq)
par(mfrow=c(2,2))
for (i in 5:(ncol(iq))) {
  gg1 <- ggplot(iq,
                aes(x = weekofyear,
                    y = iq[, i],
                    group = weekofyear)) +
    geom_boxplot() +
    scale_x_continuous(breaks = seq(1, 52, 1)) +
    ylab(cnames[i]) +
    ggtitle(paste(cnames[i]))
  print(gg1)
}
## Warning: Removed 234 rows containing non-finite values (stat_boxplot).
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
## Warning: Removed 8 rows containing non-finite values (stat_boxplot).
# Drop the loop temporaries from the IQ weekly boxplot chunk.
rm(cnames, i, gg1)
Bar graphs of total cases over time, for both cities combined and for each city separately. This uses only the training set.
library(ggplot2)
# Re-merge so week_start_date is available again (it was removed from
# df_train_labels before the histograms above).
df_train_labels <- merge(dengue_features_train, dengue_labels_train, by=c("city","year","weekofyear"))
# NOTE(review): par() only affects base graphics and is a no-op for the ggplot
# calls below.
par(mfcol=c(1,3))
# Dengue Cases both cities together
ggplot(data = df_train_labels, aes(x=week_start_date, y=total_cases)) +
geom_bar(stat = "identity", fill = "purple") +
labs(title = "Total Dengue Cases - both cities combined",
subtitle = paste(min(df_train_labels$week_start_date),max(df_train_labels$week_start_date), sep = " to "),
x = "Date", y = "Total dengue cases")
# Dengue Cases for San Juan
# FIX: "sj" in the DengAI data is San Juan, Puerto Rico — not San Jose.
ggplot(data = df_train_labels[df_train_labels$city == "sj",], aes(x=week_start_date, y=total_cases)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Total Dengue Cases in San Juan",
subtitle = paste(min(df_train_labels$week_start_date[df_train_labels$city == "sj"]),max(df_train_labels$week_start_date[df_train_labels$city == "sj"]), sep = " to "),
x = "Date", y = "Total dengue cases")
# Dengue Cases for Iquitos
ggplot(data = df_train_labels[df_train_labels$city == "iq",], aes(x=week_start_date, y=total_cases)) +
geom_bar(stat = "identity", fill = "green") +
labs(title = "Total Dengue Cases in Iquitos",
subtitle = paste(min(df_train_labels$week_start_date[df_train_labels$city == "iq"]),max(df_train_labels$week_start_date[df_train_labels$city == "iq"]), sep = " to "),
x = "Date", y = "Total dengue cases")
Boxplots and bar graphs of total cases by week of year for each city. This uses only the training set.
library(ggplot2)
# Weekly seasonality of total_cases: per-week boxplots (red point = weekly mean)
# and bar graphs of the weekly mean, for each city separately.
# NOTE(review): fun.y was the correct argument name for the ggplot2 version this
# was written against; ggplot2 >= 3.3 renamed it to `fun`.
gg1 <- ggplot(sj_train_labels,
aes(x=weekofyear,
y = total_cases,
group = weekofyear)) +
geom_boxplot() +
scale_x_continuous(breaks=seq(1,52,1)) +
stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") +
ylab("Total cases") +
ggtitle(paste("Boxplot: Total cases by Week for SJ"))
print(gg1)
# The bar graphs use the raw label frames (sj_labels_train / iq_labels_train),
# which contain the same total_cases values as the merged frames.
gg3 <- ggplot(data=sj_labels_train, aes(x=weekofyear, y=total_cases)) +
geom_bar(stat="summary", fun.y = "mean") +
ggtitle(paste("Bar graph: Average total cases by Week for SJ")) +
scale_x_continuous(breaks = seq(1,52, 2))
print(gg3)
gg2 <- ggplot(iq_train_labels,
aes(x=weekofyear,
y = total_cases,
group = weekofyear)) +
geom_boxplot() +
scale_x_continuous(breaks=seq(1,52,1)) +
stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") +
ylab("Total cases") +
ggtitle(paste("Boxplot: Total cases by Week for IQ"))
print(gg2)
gg4 <- ggplot(data=iq_labels_train, aes(x=weekofyear, y=total_cases)) +
geom_bar(stat="summary", fun.y = "mean") +
ggtitle(paste("Bar graph: Average total cases by Week for IQ")) +
scale_x_continuous(breaks = seq(1,52, 2))
print(gg4)
rm(gg1, gg2, gg3, gg4)
Scatterplot using training set only.
# Scatter total_cases (x) against each climate variable (y) for both cities.
# Columns 5..(last-1) are the climate variables; the final column is
# total_cases itself, so it is excluded.
col_names <- colnames(df_train_labels)
par(mfrow = c(2, 2))
for (col_idx in seq(5, ncol(df_train_labels) - 1)) {
  y_vals <- df_train_labels[, col_idx]
  plot(df_train_labels$total_cases, y_vals,
       cex = 0.5,
       pch = 19,
       ylim = range(y_vals, na.rm = TRUE),
       main = paste0("Total_cases by climate variables: ", col_names[col_idx]),
       ylab = col_names[col_idx])
}
rm(col_names, col_idx, y_vals)
Same as above but for SJ
# Same scatterplots, restricted to San Juan rows.
cnames <- colnames(df_train_labels)
par(mfrow=c(2,2))
# Columns 5..(last-1) are the climate variables; the last column is total_cases.
for (i in 5:(ncol(df_train_labels)-1)) {
plot(df_train_labels$total_cases[df_train_labels$city == "sj"],
df_train_labels[df_train_labels$city == "sj",i],
cex = 0.5,
pch = 19,
# y-limits span BOTH cities' data — presumably to keep the axes comparable
# with the combined and IQ versions of this plot; confirm this is intended.
ylim = c(min(df_train_labels[,i],na.rm=TRUE), max(df_train_labels[,i],na.rm=TRUE)),
main = paste("Total_cases for SJ by climate variables", cnames[i], sep = ": "),
ylab = cnames[i])
}
rm(cnames, i)
Same as above but for IQ.
# Same scatterplots, restricted to Iquitos rows.
cnames <- colnames(df_train_labels)
par(mfrow=c(2,2))
# Columns 5..(last-1) are the climate variables; the last column is total_cases.
for (i in 5:(ncol(df_train_labels)-1)) {
plot(df_train_labels$total_cases[df_train_labels$city == "iq"],
df_train_labels[df_train_labels$city == "iq",i],
cex = 0.5,
pch = 19,
# y-limits span BOTH cities' data — presumably to keep the axes comparable
# with the combined and SJ versions of this plot; confirm this is intended.
ylim = c(min(df_train_labels[,i],na.rm=TRUE), max(df_train_labels[,i],na.rm=TRUE)),
main = paste("Total_cases for IQ by climate variables", cnames[i], sep = ": "),
ylab = cnames[i])
}
rm(cnames, i)
We can see that the same feature is significantly different in each city
# Mann-Whitney (Wilcoxon rank-sum) test of whether each climate variable (and
# total_cases) is distributed differently in SJ vs IQ. Columns 1-4 are the
# city/year/week identifiers and the date, so the loop starts at column 5.
cnames <- colnames(sj_train_labels)
for (i in 5:(ncol(sj_train_labels))){
# Unpaired two-sample test; relies on both frames sharing the same column order.
wilt <- wilcox.test(sj_train_labels[,i],iq_train_labels[,i])
print(cnames[i])
print(wilt)
}
## [1] "ndvi_ne"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 21691, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_nw"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 32596, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_se"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 107990, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "ndvi_sw"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 78560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "precipitation_amt_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 369950, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_avg_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 255790, p-value = 0.03716
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_dew_point_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 208230, p-value = 3.071e-05
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_max_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 4645.5, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_min_air_temp_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 474700, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_precip_amt_kg_per_m2"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 139740, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_relative_humidity_percent"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 62770, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_sat_precip_amt_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 118470, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_specific_humidity_g_per_kg"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 192510, p-value = 4.502e-10
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "reanalysis_tdtr_k"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 22, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_avg_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 183690, p-value = 1.887e-08
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_diur_temp_rng_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 6834, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_max_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 59998, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_min_temp_c"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 361870, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "station_precip_mm"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 142000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
##
## [1] "total_cases"
##
## Wilcoxon rank sum test with continuity correction
##
## data: sj_train_labels[, i] and iq_train_labels[, i]
## W = 401310, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# Drop the temporaries from the Wilcoxon loop above.
rm(cnames, i, wilt)
There are several variables which appear to be the same feature but taken from a different source. For example, station_precip_mm and precipitation_amt_mm and reanalysis_sat_precip_amt_mm all appear to be the same “Total Precipitation value” Only one should be kept if they are the same.
“station_max_temp_c” and “reanalysis_max_air_temp_k” (scaled to Celsius)
library(ggplot2)
# Compare the two "max temperature" sources: the station reading vs the
# reanalysis value (converted to Celsius earlier). If the sources agreed,
# the differences plotted below would centre on zero.
#generate a difference in max temp variable
sj_train_labels$max_air_diff <- sj_train_labels$station_max_temp_c - sj_train_labels$reanalysis_max_air_temp_k
#barplot the difference by year
ggplot(sj_train_labels,aes(x=year, y=max_air_diff))+
geom_bar(stat='identity')
## Warning: Removed 6 rows containing missing values (position_stack).
#box plot difference by year
ggplot(sj_train_labels, aes(x=year, y = max_air_diff, group = year)) + geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
#Add month to the dataframe
# POSIXlt months are 0-based, hence the +1 to get 1..12.
sj_train_labels$month <- as.POSIXlt(sj_train_labels$week_start_date)$mon +1
#box plot difference by month
ggplot(sj_train_labels, aes(x=month, y = max_air_diff, group = month)) + geom_boxplot() + scale_x_continuous(breaks=seq(1,12,1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the scratch columns so the frame returns to its original shape.
sj_train_labels$max_air_diff <- NULL
sj_train_labels$month <- NULL
“station_min_temp_c” and “reanalysis_min_air_temp_k” (scaled to Celsius)
library(ggplot2)
# Same comparison as above, for the two "min temperature" sources.
#generate a difference in min temp variable
sj_train_labels$min_air_diff <- sj_train_labels$station_min_temp_c - sj_train_labels$reanalysis_min_air_temp_k
#barplot the difference by year
ggplot(sj_train_labels,aes(x=year, y=min_air_diff))+
geom_bar(stat='identity')
## Warning: Removed 6 rows containing missing values (position_stack).
#box plot difference by year
ggplot(sj_train_labels, aes(x=year, y = min_air_diff, group = year)) + geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
#Add month to the dataframe
# POSIXlt months are 0-based, hence the +1 to get 1..12.
sj_train_labels$month <- as.POSIXlt(sj_train_labels$week_start_date)$mon +1
#box plot difference by month
ggplot(sj_train_labels, aes(x=month, y = min_air_diff, group = month)) + geom_boxplot() + scale_x_continuous(breaks=seq(1,12,1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the scratch columns so the frame returns to its original shape.
sj_train_labels$min_air_diff <- NULL
sj_train_labels$month <- NULL
“station_avg_temp_c” and “reanalysis_avg_temp_k” (scaled to Celsius)
library(ggplot2)
# Same comparison as above, for the two "average temperature" sources.
#generate a difference in avg temp variable
sj_train_labels$avg_air_diff <- sj_train_labels$station_avg_temp_c - sj_train_labels$reanalysis_avg_temp_k
#barplot the difference by year
ggplot(sj_train_labels,aes(x=year, y=avg_air_diff))+
geom_bar(stat='identity')
## Warning: Removed 6 rows containing missing values (position_stack).
#box plot difference by year
ggplot(sj_train_labels, aes(x=year, y = avg_air_diff, group = year)) + geom_boxplot()
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
#Add month to the dataframe
# POSIXlt months are 0-based, hence the +1 to get 1..12.
sj_train_labels$month <- as.POSIXlt(sj_train_labels$week_start_date)$mon +1
#box plot difference by month
ggplot(sj_train_labels, aes(x=month, y = avg_air_diff, group = month)) + geom_boxplot() + scale_x_continuous(breaks=seq(1,12,1))
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
# Drop the scratch columns so the frame returns to its original shape.
sj_train_labels$avg_air_diff <- NULL
sj_train_labels$month <- NULL
“station_precip_mm”, “precipitation_amt_mm”, “reanalysis_sat_precip_amt_mm”, “reanalysis_precip_amt_kg_per_m2”
library(ggplot2)
# Pairwise comparison of the four precipitation measures: for every pair of
# columns, plot the per-week difference by year (bar + box) and by month (box).
# If two columns were the same measurement, their difference would be all zeros.
precip <- c("station_precip_mm", "precipitation_amt_mm", "reanalysis_sat_precip_amt_mm", "reanalysis_precip_amt_kg_per_m2")
#Add month to the dataframe
# POSIXlt months are 0-based, hence the +1 to get 1..12.
sj_train_labels$month <- as.POSIXlt(sj_train_labels$week_start_date)$mon +1
for (i in 1:3){
# NOTE(review): par() has no effect on ggplot output (grid graphics).
par(mfrow=c(1,3))
#generate the first variable in the list
p1 <- precip[i]
ind1 <- which(colnames(sj_train_labels)==p1)
for (j in ((i+1):4)){
#generate the next variable in the list
p2 <- precip[j]
ind2 <- which(colnames(sj_train_labels)==p2)
#generate a difference variable (scratch column, removed after the loops)
sj_train_labels$diff <- sj_train_labels[,ind1] - sj_train_labels[,ind2]
#barplot the difference by year
gg1 <-ggplot(sj_train_labels,
aes(x=year, y=diff))+
geom_bar(stat = "identity", fill="steelblue") +
ggtitle(paste(p1, "&", p2))
print(gg1)
#box plot the difference by year
gg2 <-ggplot(sj_train_labels,
aes(x=year, y=diff, group = year)) +
geom_boxplot() +
ggtitle(paste(p1, "&", p2))
print(gg2)
#box plot difference by month
gg3 <- ggplot(sj_train_labels,
aes(x=month, y = diff, group = month)) +
geom_boxplot() +
scale_x_continuous(breaks=seq(1,12,1)) +
ggtitle(paste(p1, "&", p2))
print(gg3)
}
}
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (position_stack).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing missing values (position_stack).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
# Remove the scratch columns and loop temporaries added for the comparison.
sj_train_labels$diff <- NULL
sj_train_labels$month <- NULL
rm(gg1, gg2, gg3, i, ind1, ind2, j, p1, p2, precip)
This section of the exploratory analysis reviews how the major components of climate affect dengue cases. A 10-fold cross-validated decision tree is used to estimate the MAE for each group of variables.
library(rpart)
library(plyr)
# 10-fold CV: how well do the vegetation (NDVI) variables alone predict
# total_cases for San Juan?
form <- "total_cases ~ ndvi_se + ndvi_sw + ndvi_ne + ndvi_nw"
# Randomly partition the rows into 10 roughly equal folds.
folds <- split(sj_train_labels,
               cut(sample(1:nrow(sj_train_labels)), 10))
errs.c50 <- rep(NA, length(folds))
for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  # BUG FIX: total_cases is numeric, so rpart fits a *regression* tree and
  # predict() returns continuous fitted means; tabulating those against the
  # observed counts in a "confusion matrix" produced a meaningless ~98%
  # error. Compute the MAE the surrounding text describes instead.
  errs.c50[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}
print(sprintf("average MAE using 10-fold cross validation and regression tree: %.3f cases", mean(errs.c50)))
## Output from the original (confusion-matrix) version, kept for reference:
## [1] "average error using k-fold cross validation and decision tree algorithm: 97.973 percent"
rm(folds, test, tmp.model, train, errs.c50, form, i, tmp.predict)
library(rpart)
library(plyr)
# 10-fold CV: how well do the temperature variables predict total_cases for SJ?
form <- "total_cases ~ station_max_temp_c + station_min_temp_c + station_avg_temp_c + station_diur_temp_rng_c + reanalysis_dew_point_temp_k + reanalysis_air_temp_k + reanalysis_max_air_temp_k + reanalysis_min_air_temp_k + reanalysis_avg_temp_k + reanalysis_tdtr_k"
# Randomly partition the rows into 10 roughly equal folds.
folds <- split(sj_train_labels,
               cut(sample(1:nrow(sj_train_labels)), 10))
errs.c50 <- rep(NA, length(folds))
for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  # BUG FIX: regression-tree predictions are continuous, so the original
  # confusion-matrix "error" was meaningless; compute the intended MAE.
  errs.c50[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}
print(sprintf("average MAE using 10-fold cross validation and regression tree: %.3f cases", mean(errs.c50)))
## Output from the original (confusion-matrix) version, kept for reference:
## [1] "average error using k-fold cross validation and decision tree algorithm: 97.113 percent"
rm(folds, test, tmp.model, train, errs.c50, form, i, tmp.predict)
library(rpart)
library(plyr)
# 10-fold CV: how well do the precipitation variables predict total_cases for SJ?
form <- "total_cases ~ station_precip_mm + precipitation_amt_mm + reanalysis_sat_precip_amt_mm + reanalysis_precip_amt_kg_per_m2"
# Randomly partition the rows into 10 roughly equal folds.
folds <- split(sj_train_labels,
               cut(sample(1:nrow(sj_train_labels)), 10))
errs.c50 <- rep(NA, length(folds))
for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  # BUG FIX: regression-tree predictions are continuous, so the original
  # confusion-matrix "error" was meaningless; compute the intended MAE.
  errs.c50[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}
print(sprintf("average MAE using 10-fold cross validation and regression tree: %.3f cases", mean(errs.c50)))
## Output from the original (confusion-matrix) version, kept for reference:
## [1] "average error using k-fold cross validation and decision tree algorithm: 97.112 percent"
rm(folds, test, tmp.model, train, errs.c50, form, i, tmp.predict)
library(rpart)
library(plyr)
# 10-fold CV: how well do the humidity variables predict total_cases for SJ?
form <- "total_cases ~ reanalysis_relative_humidity_percent + reanalysis_specific_humidity_g_per_kg"
# Randomly partition the rows into 10 roughly equal folds.
folds <- split(sj_train_labels,
               cut(sample(1:nrow(sj_train_labels)), 10))
errs.c50 <- rep(NA, length(folds))
for (i in seq_along(folds)) {
  test <- ldply(folds[i], data.frame)
  train <- ldply(folds[-i], data.frame)
  tmp.model <- rpart(as.formula(form), train)
  tmp.predict <- predict(tmp.model, newdata = test)
  # BUG FIX: regression-tree predictions are continuous, so the original
  # confusion-matrix "error" was meaningless; compute the intended MAE.
  errs.c50[i] <- mean(abs(test$total_cases - tmp.predict), na.rm = TRUE)
}
print(sprintf("average MAE using 10-fold cross validation and regression tree: %.3f cases", mean(errs.c50)))
## Output from the original (confusion-matrix) version, kept for reference:
## [1] "average error using k-fold cross validation and decision tree algorithm: 98.077 percent"
rm(folds, test, tmp.model, train, errs.c50, form, i, tmp.predict)
Boxplot includes test and training set - NA still included
library(ggplot2)
# City-vs-city boxplots for every climate column (train + test rows; NAs are
# dropped by stat_boxplot with a warning).
cnames <- colnames(df)
for (i in 5:(ncol(df))) {
# NOTE(review): aes(y = df[,i]) bypasses ggplot's data masking; it works here
# because df is also the plot data, but aes(y = .data[[cnames[i]]]) would be
# the robust form on newer ggplot2.
p <- ggplot(df, aes(x=city, y = df[,i], fill = city)) +
geom_boxplot() +
labs(title = "Boxplot of climate variables",
subtitle = cnames[i],
x = "City", y = cnames[i])
print(p)
}
## Warning: Removed 237 rows containing non-finite values (stat_boxplot).
## Warning: Removed 63 rows containing non-finite values (stat_boxplot).
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 15 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).
# Drop the loop temporaries from the city boxplot chunk.
rm(cnames, i, p)
library(ggplot2)
# Compare the distribution of weekly total_cases between the two cities.
cases_box <- ggplot(df_train_labels, aes(x = city, y = total_cases, fill = city)) +
  geom_boxplot() +
  labs(title = "Boxplot of Total_cases", x = "City", y = "Total_cases")
print(cases_box)
rm(cases_box)
Clean up all the extra dataframes produced during the exploratory analysis
# The exploratory copies are no longer needed; only the per-city merged
# training frames (sj_train_labels / iq_train_labels) are kept for what follows.
rm(dengue_features_test,
dengue_features_train,
dengue_labels_train,
sj_features_test,
sj_features_train,
sj_labels_train,
iq_features_test,
iq_features_train,
iq_labels_train,
df,
iq,
sj,
df_train_labels,
submission_format
)
In this section, we look at the number of missing values. Later we will do something about these missing values.
# Count missing values per column in each city's training frame (inspected
# interactively via the commented View() calls, then discarded).
sj_train.na <- sapply(sj_train_labels, function(x) sum(is.na (x)))
iq_train.na <- sapply(iq_train_labels, function(x) sum(is.na (x)))
#df_train_labels.na <- sum(is.na(df_train_labels$total_cases))
# View(sj_train.na)
# View(iq_train.na)
#df_train_labels.na
rm(sj_train.na)
rm(iq_train.na)
#rm(df_train_labels.na)
# Missing-data strategy 1: simply drop incomplete rows.
sj_train_labels.naomit <- na.omit(sj_train_labels)
iq_train_labels.naomit <- na.omit(iq_train_labels)
library(zoo)
#library(tidyverse)
library(plyr)
# Sort chronologically first: last-observation-carried-forward imputation is
# only meaningful on time-ordered rows.
sj_train_labels <- sj_train_labels[order(sj_train_labels$year, sj_train_labels$weekofyear),]
iq_train_labels <- iq_train_labels[order(iq_train_labels$year, iq_train_labels$weekofyear),]
# Make copies that will hold the LOCF-imputed values (".lastna" versions).
sj_train_labels.lastna <- sj_train_labels
iq_train_labels.lastna <-iq_train_labels
# Impute each column independently with zoo::na.locf via plyr::colwise.
# NOTE(review): na.locf() drops *leading* NAs by default, which would leave
# columns of unequal length if a column starts with NA - presumably no
# column here begins with NA (the zero-NA checks below succeeded), but
# worth confirming before reusing on new data.
sj_train_labels.lastna <- colwise(na.locf)(sj_train_labels.lastna)
iq_train_labels.lastna <- colwise(na.locf)(iq_train_labels.lastna)
#Issues using tidyverse as the locf function converts all values to character
# sj_train_labels.lastna <- sj_train_labels.lastna %>% do(na.locf(.))
# iq_train_labels.lastna <-iq_train_labels.lastna %>% do(na.locf(.))
# Sanity check: both imputed frames should report zero remaining NAs.
sum(is.na(sj_train_labels.lastna))
sum(is.na(iq_train_labels.lastna))
Removing the city and the week_start_date from the dataset will allow for easier analysis
#keep a version with the start week included
sj_train_labels.startweek <- sj_train_labels.lastna
iq_train_labels.startweek <- iq_train_labels.lastna
# Drop the non-numeric identifier columns, grouped per dataframe: every
# frame loses city, while the .startweek copies keep week_start_date.
sj_train_labels.naomit$city <- NULL
sj_train_labels.naomit$week_start_date <- NULL
sj_train_labels.lastna$city <- NULL
sj_train_labels.lastna$week_start_date <- NULL
sj_train_labels.startweek$city <- NULL
iq_train_labels.naomit$city <- NULL
iq_train_labels.naomit$week_start_date <- NULL
iq_train_labels.lastna$city <- NULL
iq_train_labels.lastna$week_start_date <- NULL
iq_train_labels.startweek$city <- NULL
In this section, we look at the correlation between total_cases and the climate variables.
First we need to remove any of the non-numeric variables. Missing values are still present in this first correlation analysis; the analysis will be repeated after the missing values have been handled.
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.4
## corrplot 0.84 loaded
#Correlation for SJ
# cor() already returns a matrix; keep it as one instead of the previous
# data.frame -> as.matrix round trip, which added nothing.
sj_train_labels.cor <- round(cor(sj_train_labels[, 5:25],
                                 use = "complete.obs"), 2)
#View(sj_train_labels.cor)
corrplot(sj_train_labels.cor,
         method = "number",
         type = "upper",
         # corrplot draws the title in the outer margin; without extra top
         # margin the title is clipped (see ?corrplot, 'mar' argument).
         mar = c(0, 0, 1, 0),
         title = "Corrplot for SJ")
#Correlation for IQ
iq_train_labels.cor <- round(cor(iq_train_labels[, 5:25],
                                 use = "complete.obs"), 2)
#View(iq_train_labels.cor)
plot.new()
corrplot(iq_train_labels.cor,
         method = "number",
         type = "upper",
         mar = c(0, 0, 1, 0),
         title = "Corrplot for IQ")
rm(iq_train_labels.cor, sj_train_labels.cor)
# write.csv(df_train_labels.cor, file = "df_train_labels.cor.csv")
# write.csv(sj_train_labels.cor, file = "sj_train_labels.cor.csv")
# write.csv(iq_train_labels.cor, file = "iq_train_labels.cor.csv")
library(corrplot)
# Correlation on the listwise-deleted data.  Keep the rounded correlation as
# a matrix (what corrplot expects) rather than converting
# data.frame -> matrix on every call.
sj_train_labels.cor.naomit <- round(cor(sj_train_labels.naomit,
                                        use = "complete.obs"), 2)
#View(sj_train_labels.cor.naomit)
corrplot(sj_train_labels.cor.naomit,
         method = "number",
         type = "upper")
iq_train_labels.cor.naomit <- round(cor(iq_train_labels.naomit,
                                        use = "complete.obs"), 2)
#View(iq_train_labels.cor.naomit)
corrplot(iq_train_labels.cor.naomit,
         method = "number",
         type = "upper")
rm(sj_train_labels.cor.naomit, iq_train_labels.cor.naomit)
library(corrplot)
# Correlation on the LOCF-imputed data.  As above, keep the rounded
# correlations as matrices instead of the data.frame/as.matrix round trip.
sj_train_labels.cor.lastna <- round(cor(sj_train_labels.lastna,
                                        use = "complete.obs"), 2)
#View(sj_train_labels.cor.lastna)
corrplot(sj_train_labels.cor.lastna,
         method = "number",
         type = "upper")
iq_train_labels.cor.lastna <- round(cor(iq_train_labels.lastna,
                                        use = "complete.obs"), 2)
#View(iq_train_labels.cor.lastna)
corrplot(iq_train_labels.cor.lastna,
         type = "upper",
         tl.pos = "td",
         method = "number",
         tl.cex = 0.75,
         tl.col = 'black',
         order = "hclust",
         # scale the coefficient text down as the column count grows
         number.cex = 7/ncol(iq_train_labels.lastna),
         diag = FALSE)
rm(iq_train_labels.cor.lastna, sj_train_labels.cor.lastna)
# write.csv(sj_train_labels.cor.naomit, file = "sj_train_labels.cor.naomit.csv")
# write.csv(iq_train_labels.cor.naomit, file = "iq_train_labels.cor.naomit.csv")
#
# write.csv(sj_train_labels.cor.lastna, file = "sj_train_labels.cor.lastna.csv")
# write.csv(iq_train_labels.cor.lastna, file = "iq_train_labels.cor.lastna.csv")
Different methods of imputing missing values had no impact on correlation. We will stick with the last-observation-carried-forward (lastna) version as the final dataset.
rm(sj_train_labels.naomit, iq_train_labels.naomit)
We will use various methods to see if we can find any features that need to be eliminated
library(mlbench)
## Warning: package 'mlbench' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
# calculate correlation matrix
CorrelationMatrix <- cor(sj_train_labels.lastna)
# find attributes that are highly correlated (ideally >0.75)
highlyCorrelated <- findCorrelation(CorrelationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## [1] 16 10 8 9 18 11 12 7 5
# Map the indexes back to column names.  The previous
# `for (i in list(highlyCorrelated))` loop only "worked" because list()
# wrapped the whole vector into a single element, making the loop run once
# with i = the full vector; plain vectorized indexing is the intended
# operation (and makes the unused loop variable unnecessary).
cnames <- colnames(sj_train_labels.lastna)
print(cnames[highlyCorrelated])
## [1] "reanalysis_specific_humidity_g_per_kg"
## [2] "reanalysis_dew_point_temp_k"
## [3] "reanalysis_air_temp_k"
## [4] "reanalysis_avg_temp_k"
## [5] "station_avg_temp_c"
## [6] "reanalysis_max_air_temp_k"
## [7] "reanalysis_min_air_temp_k"
## [8] "precipitation_amt_mm"
## [9] "ndvi_se"
rm(CorrelationMatrix, cnames, highlyCorrelated)
library(mlbench)
library(caret)
# calculate correlation matrix
CorrelationMatrix <- cor(iq_train_labels.lastna)
# find attributes that are highly correlated (ideally >0.75)
highlyCorrelated <- findCorrelation(CorrelationMatrix, cutoff=0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
## [1] 17 11 16 10 9 7 5 3 4
# Vectorized name lookup replaces the accidental single-iteration
# `for (i in list(highlyCorrelated))` loop (list() made the loop body run
# once with the entire index vector).
cnames <- colnames(iq_train_labels.lastna)
print(cnames[highlyCorrelated])
## [1] "reanalysis_tdtr_k"
## [2] "reanalysis_max_air_temp_k"
## [3] "reanalysis_specific_humidity_g_per_kg"
## [4] "reanalysis_dew_point_temp_k"
## [5] "reanalysis_avg_temp_k"
## [6] "precipitation_amt_mm"
## [7] "ndvi_se"
## [8] "ndvi_ne"
## [9] "ndvi_nw"
rm(CorrelationMatrix, cnames, highlyCorrelated)
library(mlbench)
library(caret)
# define the control using a random forest selection function
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Predictor set: columns 3:23 as before, but with the response excluded.
# The original call left total_cases inside the predictor matrix, so RFE
# "selected" the outcome itself (the recorded output below lists
# total_cases as the top variable) - classic target leakage.
sj_predictors <- setdiff(colnames(sj_train_labels.lastna)[3:23], "total_cases")
# run the RFE algorithm for SJ
sj_rfe_results <- rfe(sj_train_labels.lastna[, sj_predictors],
                      sj_train_labels.lastna$total_cases,
                      sizes = c(3:23), rfeControl = control)
# summarize the results
print(sj_rfe_results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 3 9.681 0.9680 3.684 5.092 0.03069 1.0893
## 4 12.558 0.9506 5.822 5.287 0.04434 1.3020
## 5 14.865 0.9352 7.815 5.114 0.05084 1.3636
## 6 9.546 0.9710 4.067 4.250 0.02190 0.9598
## 7 10.545 0.9698 5.009 4.154 0.01747 0.8538
## 8 12.113 0.9624 6.169 4.326 0.02403 1.0039
## 9 9.230 0.9769 4.107 4.114 0.01536 0.9095 *
## 10 10.485 0.9714 4.946 4.436 0.01720 1.0601
## 11 11.594 0.9674 5.689 4.410 0.01741 1.0692
## 12 9.271 0.9777 4.165 4.001 0.01326 0.9115
## 13 10.111 0.9757 4.811 4.042 0.01364 0.9933
## 14 11.075 0.9720 5.355 4.347 0.01667 0.9896
## 15 9.465 0.9766 4.295 4.220 0.01530 0.9034
## 16 10.638 0.9732 4.846 4.445 0.01570 1.0422
## 17 11.066 0.9720 5.328 4.477 0.01431 1.0534
## 18 9.718 0.9773 4.338 4.203 0.01211 0.9156
## 19 10.446 0.9737 4.820 4.190 0.01481 0.8921
## 20 11.218 0.9722 5.196 4.782 0.01323 1.0548
## 21 9.755 0.9773 4.323 4.396 0.01117 1.0104
##
## The top 5 variables (out of 9):
## total_cases, ndvi_se, ndvi_sw, reanalysis_specific_humidity_g_per_kg, ndvi_nw
# list the chosen features
# NOTE(review): total_cases heading the "predictor" list shows the response
# itself sat inside the predictor columns passed to rfe() above - target
# leakage that should be fixed upstream before trusting this ranking.
predictors(sj_rfe_results)
## [1] "total_cases"
## [2] "ndvi_se"
## [3] "ndvi_sw"
## [4] "reanalysis_specific_humidity_g_per_kg"
## [5] "ndvi_nw"
## [6] "reanalysis_dew_point_temp_k"
## [7] "ndvi_ne"
## [8] "reanalysis_precip_amt_kg_per_m2"
## [9] "reanalysis_tdtr_k"
# plot the results
# RMSE vs. number of retained predictors, with the selected subset marked
plot(sj_rfe_results, type=c("g", "o"), main = "RFE plot for SJ")
rm(sj_rfe_results, control)
library(mlbench)
library(caret)
# define the control using a random forest selection function
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# Restrict the predictors to columns 3:23 minus the response.  The original
# call passed the *entire* dataframe as predictors, so total_cases and year
# were used to predict total_cases - the recorded output below shows the
# leakage ("total_cases, year" among the selected variables).
iq_predictors <- setdiff(colnames(iq_train_labels.lastna)[3:23], "total_cases")
# run the RFE algorithm for IQ
iq_rfe_results <- rfe(iq_train_labels.lastna[, iq_predictors],
                      iq_train_labels.lastna$total_cases,
                      sizes = c(3:23), rfeControl = control)
# summarize the results
print(iq_rfe_results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 3 2.589 0.9661 1.051 2.299 0.02912 0.4917 *
## 4 3.397 0.9458 1.554 2.611 0.03555 0.5639
## 5 4.025 0.9234 1.971 2.694 0.03823 0.6190
## 6 2.988 0.9499 1.147 2.730 0.04625 0.5304
## 7 3.377 0.9409 1.416 2.843 0.04864 0.5655
## 8 3.691 0.9287 1.663 3.036 0.05770 0.6507
## 9 3.142 0.9422 1.203 2.982 0.05878 0.5789
## 10 3.401 0.9355 1.394 3.057 0.05896 0.6049
## 11 3.610 0.9277 1.541 3.141 0.06659 0.6535
## 12 3.211 0.9390 1.225 3.069 0.06600 0.5827
## 13 3.424 0.9319 1.371 3.057 0.06752 0.5859
## 14 3.592 0.9250 1.495 3.160 0.07359 0.6175
## 15 3.288 0.9332 1.282 3.088 0.06966 0.5768
## 16 3.455 0.9283 1.374 3.148 0.07302 0.5879
## 17 3.593 0.9260 1.501 3.229 0.07503 0.6474
## 18 3.308 0.9323 1.284 3.207 0.07764 0.6014
## 19 3.437 0.9312 1.395 3.217 0.07764 0.6183
## 20 3.562 0.9235 1.445 3.231 0.07958 0.6064
## 21 3.359 0.9300 1.328 3.185 0.07615 0.6103
## 22 3.476 0.9265 1.421 3.246 0.07899 0.6244
## 23 3.637 0.9220 1.504 3.171 0.07339 0.6025
##
## The top 3 variables (out of 3):
## total_cases, year, station_avg_temp_c
# list the chosen features
# NOTE(review): total_cases and year in the "selected" set confirm the whole
# dataframe (response included) was passed to rfe() above - leakage that
# should be fixed upstream.
predictors(iq_rfe_results)
## [1] "total_cases" "year" "station_avg_temp_c"
# plot the results
# RMSE vs. number of retained predictors, with the selected subset marked
plot(iq_rfe_results, type=c("g", "o"), main = "RFE plot for IQ")
rm(iq_rfe_results, control)
# ensure results are repeatable
set.seed(136)
# load the library
library(mlbench)
library(caret)
# prepare training scheme
# NOTE(review): repeats = 1 makes "repeatedcv" equivalent to a single round
# of 11-fold CV, and number = 11 differs from the 10-fold / 3-repeat scheme
# used for the IQ run below - confirm whether that asymmetry is intentional.
control <- trainControl(method="repeatedcv", number=11, repeats=1)
# train the model
# Conditional-inference random forest on the SJ data; the formula interface
# keeps total_cases (the response) out of the predictor set automatically.
model <- train(total_cases~., data=sj_train_labels.lastna[,3:23], method="cforest", preProcess="scale", trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## cforest variable importance
##
## Overall
## ndvi_se 2707.068
## ndvi_sw 1264.511
## reanalysis_specific_humidity_g_per_kg 524.414
## reanalysis_max_air_temp_k 232.114
## reanalysis_tdtr_k 68.758
## station_avg_temp_c 47.346
## station_max_temp_c 38.592
## reanalysis_precip_amt_kg_per_m2 35.714
## ndvi_nw 31.800
## reanalysis_relative_humidity_percent 26.746
## station_min_temp_c 25.271
## ndvi_ne 22.196
## reanalysis_dew_point_temp_k 17.165
## station_diur_temp_rng_c 14.770
## reanalysis_min_air_temp_k 11.496
## precipitation_amt_mm 10.432
## station_precip_mm 6.900
## reanalysis_air_temp_k 4.880
## reanalysis_avg_temp_k 1.967
## reanalysis_sat_precip_amt_mm 0.000
# plot importance
plot(importance)
# inspect the fitted cforest object itself
model$finalModel
##
## Random Forest using Conditional Inference Trees
##
## Number of trees: 500
##
## Response: .outcome
## Inputs: ndvi_ne, ndvi_nw, ndvi_se, ndvi_sw, precipitation_amt_mm, reanalysis_air_temp_k, reanalysis_avg_temp_k, reanalysis_dew_point_temp_k, reanalysis_max_air_temp_k, reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2, reanalysis_relative_humidity_percent, reanalysis_sat_precip_amt_mm, reanalysis_specific_humidity_g_per_kg, reanalysis_tdtr_k, station_avg_temp_c, station_diur_temp_rng_c, station_max_temp_c, station_min_temp_c, station_precip_mm
## Number of observations: 936
# GCtorture is never created in this section, so rm()'ing it only raised a
# spurious "object not found" warning; drop it from the cleanup call.
rm(model, importance, control)
# ensure results are repeatable
set.seed(136)
# load the library
library(mlbench)
library(caret)
# prepare training scheme
# 10-fold cross-validation repeated 3 times
control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
# Conditional-inference random forest on the IQ data; the formula interface
# keeps total_cases (the response) out of the predictor set automatically.
model <- train(total_cases~., data=iq_train_labels.lastna[,3:23], method="cforest", preProcess="scale", trControl=control)
# estimate variable importance
importance <- varImp(model, scale=FALSE)
# summarize importance
print(importance)
## cforest variable importance
##
## Overall
## reanalysis_specific_humidity_g_per_kg 3.318451
## reanalysis_dew_point_temp_k 2.940082
## reanalysis_min_air_temp_k 1.674939
## reanalysis_tdtr_k 1.378423
## reanalysis_precip_amt_kg_per_m2 1.285543
## station_min_temp_c 1.212590
## reanalysis_relative_humidity_percent 1.087495
## reanalysis_avg_temp_k 1.078973
## reanalysis_air_temp_k 0.908769
## station_avg_temp_c 0.842579
## station_max_temp_c 0.559030
## station_diur_temp_rng_c 0.129754
## reanalysis_sat_precip_amt_mm 0.115857
## station_precip_mm 0.047493
## ndvi_nw 0.012152
## ndvi_sw 0.007328
## ndvi_se -0.037845
## reanalysis_max_air_temp_k -0.095271
## precipitation_amt_mm -0.168698
## ndvi_ne -0.267532
# plot importance
plot(importance)
# The original cleanup also listed GCtorture, which does not exist here and
# produced the "object 'GCtorture' not found" warning recorded below.
rm(model, importance, control)
## Warning in rm(model, importance, control, GCtorture): object 'GCtorture'
## not found
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.4.4
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.4.4
# Use a plain formula: with the old `sj_train_labels.lastna$total_cases ~ .`
# form the dot expanded over every column of `data`, *including*
# total_cases, so the response also appeared on the right-hand side (target
# leakage).  `total_cases ~ .` excludes the response from the predictors.
sj_train_labels.boruta <- Boruta(total_cases ~ ., data = sj_train_labels.lastna, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## 12. run of importance source...
## After 12 iterations, +29 secs:
## confirmed 14 attributes: ndvi_ne, ndvi_nw, ndvi_se, ndvi_sw, reanalysis_avg_temp_k and 9 more;
## still have 8 attributes left.
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## After 16 iterations, +38 secs:
## confirmed 2 attributes: reanalysis_air_temp_k, station_min_temp_c;
## still have 6 attributes left.
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## After 19 iterations, +45 secs:
## confirmed 1 attribute: station_max_temp_c;
## still have 5 attributes left.
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## After 56 iterations, +2.1 mins:
## confirmed 1 attribute: station_precip_mm;
## still have 4 attributes left.
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## After 78 iterations, +3 mins:
## confirmed 1 attribute: reanalysis_sat_precip_amt_mm;
## still have 3 attributes left.
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## 82. run of importance source...
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## After 88 iterations, +3.3 mins:
## confirmed 1 attribute: precipitation_amt_mm;
## still have 2 attributes left.
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## After 91 iterations, +3.4 mins:
## confirmed 1 attribute: station_diur_temp_rng_c;
## still have 1 attribute left.
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
print(sj_train_labels.boruta)
## Boruta performed 99 iterations in 3.745824 mins.
## 21 attributes confirmed important: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, precipitation_amt_mm and 16 more;
## No attributes deemed unimportant.
## 1 tentative attributes left: reanalysis_tdtr_k;
#Fix and tentative attributes
# TentativeRoughFix() resolves the attributes Boruta left undecided by
# comparing their median importance against the shadow attributes' median.
sj_train_labels.boruta <- TentativeRoughFix(sj_train_labels.boruta)
print(sj_train_labels.boruta)
## Boruta performed 99 iterations in 3.745824 mins.
## Tentatives roughfixed over the last 99 iterations.
## 22 attributes confirmed important: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, precipitation_amt_mm and 17 more;
## No attributes deemed unimportant.
#Boruta plot for SJ
# Draw the importance boxplots with the default x labels suppressed, then
# add a custom axis: lz holds each attribute's finite importance history,
# and the labels are sorted by median importance so they line up with the
# median-ordered boxes drawn by plot.Boruta.
plot(sj_train_labels.boruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(sj_train_labels.boruta$ImpHistory),function(i)
sj_train_labels.boruta$ImpHistory[is.finite(sj_train_labels.boruta$ImpHistory[,i]),i])
names(lz) <- colnames(sj_train_labels.boruta$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(sj_train_labels.boruta$ImpHistory), cex.axis = 0.7)
rm(lz, Labels, sj_train_labels.boruta)
library(Boruta)
# Use a plain formula: with the old `iq_train_labels.lastna$total_cases ~ .`
# form the dot expanded over every column of `data`, including total_cases,
# leaking the response into the predictors.  `total_cases ~ .` keeps the
# response off the right-hand side.
iq_train_labels.boruta <- Boruta(total_cases ~ ., data = iq_train_labels.lastna, doTrace = 2)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## 12. run of importance source...
## After 12 iterations, +15 secs:
## confirmed 7 attributes: reanalysis_dew_point_temp_k, reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2, reanalysis_specific_humidity_g_per_kg, station_avg_temp_c and 2 more;
## still have 15 attributes left.
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## After 16 iterations, +19 secs:
## confirmed 3 attributes: reanalysis_air_temp_k, reanalysis_relative_humidity_percent, station_max_temp_c;
## rejected 2 attributes: ndvi_nw, ndvi_sw;
## still have 10 attributes left.
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## After 19 iterations, +22 secs:
## confirmed 1 attribute: reanalysis_avg_temp_k;
## still have 9 attributes left.
## 20. run of importance source...
## 21. run of importance source...
## 22. run of importance source...
## After 22 iterations, +25 secs:
## rejected 1 attribute: station_precip_mm;
## still have 8 attributes left.
## 23. run of importance source...
## 24. run of importance source...
## 25. run of importance source...
## 26. run of importance source...
## After 26 iterations, +29 secs:
## rejected 1 attribute: ndvi_ne;
## still have 7 attributes left.
## 27. run of importance source...
## 28. run of importance source...
## 29. run of importance source...
## After 29 iterations, +32 secs:
## confirmed 1 attribute: reanalysis_tdtr_k;
## still have 6 attributes left.
## 30. run of importance source...
## 31. run of importance source...
## 32. run of importance source...
## 33. run of importance source...
## 34. run of importance source...
## 35. run of importance source...
## 36. run of importance source...
## 37. run of importance source...
## 38. run of importance source...
## 39. run of importance source...
## 40. run of importance source...
## 41. run of importance source...
## 42. run of importance source...
## 43. run of importance source...
## 44. run of importance source...
## 45. run of importance source...
## 46. run of importance source...
## 47. run of importance source...
## 48. run of importance source...
## 49. run of importance source...
## 50. run of importance source...
## 51. run of importance source...
## 52. run of importance source...
## 53. run of importance source...
## 54. run of importance source...
## 55. run of importance source...
## 56. run of importance source...
## 57. run of importance source...
## 58. run of importance source...
## 59. run of importance source...
## 60. run of importance source...
## 61. run of importance source...
## 62. run of importance source...
## 63. run of importance source...
## 64. run of importance source...
## 65. run of importance source...
## 66. run of importance source...
## 67. run of importance source...
## 68. run of importance source...
## 69. run of importance source...
## 70. run of importance source...
## 71. run of importance source...
## 72. run of importance source...
## 73. run of importance source...
## 74. run of importance source...
## 75. run of importance source...
## 76. run of importance source...
## 77. run of importance source...
## 78. run of importance source...
## 79. run of importance source...
## 80. run of importance source...
## 81. run of importance source...
## After 81 iterations, +1.4 mins:
## rejected 1 attribute: ndvi_se;
## still have 5 attributes left.
## 82. run of importance source...
## 83. run of importance source...
## 84. run of importance source...
## 85. run of importance source...
## 86. run of importance source...
## 87. run of importance source...
## 88. run of importance source...
## 89. run of importance source...
## 90. run of importance source...
## 91. run of importance source...
## 92. run of importance source...
## 93. run of importance source...
## 94. run of importance source...
## 95. run of importance source...
## 96. run of importance source...
## 97. run of importance source...
## 98. run of importance source...
## 99. run of importance source...
print(iq_train_labels.boruta)
## Boruta performed 99 iterations in 1.640086 mins.
## 12 attributes confirmed important: reanalysis_air_temp_k,
## reanalysis_avg_temp_k, reanalysis_dew_point_temp_k,
## reanalysis_min_air_temp_k, reanalysis_precip_amt_kg_per_m2 and 7
## more;
## 5 attributes confirmed unimportant: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, station_precip_mm;
## 5 tentative attributes left: precipitation_amt_mm,
## reanalysis_max_air_temp_k, reanalysis_sat_precip_amt_mm,
## station_diur_temp_rng_c, station_min_temp_c;
#Fix and tentative attributes
# Resolve the five tentative attributes by comparing their median
# importance against the shadow attributes' median (TentativeRoughFix).
iq_train_labels.boruta <- TentativeRoughFix(iq_train_labels.boruta)
print(iq_train_labels.boruta)
## Boruta performed 99 iterations in 1.640086 mins.
## Tentatives roughfixed over the last 99 iterations.
## 14 attributes confirmed important: precipitation_amt_mm,
## reanalysis_air_temp_k, reanalysis_avg_temp_k,
## reanalysis_dew_point_temp_k, reanalysis_min_air_temp_k and 9 more;
## 8 attributes confirmed unimportant: ndvi_ne, ndvi_nw, ndvi_se,
## ndvi_sw, reanalysis_max_air_temp_k and 3 more;
#Boruta plot for IQ
# Custom-axis recipe: collect each attribute's finite importance history,
# then label the x axis with names sorted by median importance so they
# align with the median-ordered boxplots drawn by plot.Boruta.
plot(iq_train_labels.boruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(iq_train_labels.boruta$ImpHistory),function(i)
iq_train_labels.boruta$ImpHistory[is.finite(iq_train_labels.boruta$ImpHistory[,i]),i])
names(lz) <- colnames(iq_train_labels.boruta$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(iq_train_labels.boruta$ImpHistory), cex.axis = 0.7)
rm(lz, Labels, iq_train_labels.boruta)
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.4
library(ggplot2)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ranger':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:psych':
##
## outlier
library(caret)
#Fit a model
# Use `total_cases ~ ...` together with a named data= argument.  The old
# `sj_train_labels.startweek$total_cases ~ ...` form pulled the response
# from the global dataframe rather than from `data`, which is fragile
# (easy to desynchronise from `data`) and breaks formula-based tooling
# such as predict() on new data.
model_sj.rf <- randomForest(total_cases ~
                              ndvi_ne +
                              ndvi_nw +
                              ndvi_se +
                              ndvi_sw +
                              precipitation_amt_mm +
                              reanalysis_air_temp_k +
                              reanalysis_avg_temp_k +
                              reanalysis_dew_point_temp_k +
                              reanalysis_max_air_temp_k +
                              reanalysis_min_air_temp_k +
                              reanalysis_precip_amt_kg_per_m2 +
                              reanalysis_relative_humidity_percent +
                              reanalysis_sat_precip_amt_mm +
                              reanalysis_specific_humidity_g_per_kg +
                              reanalysis_tdtr_k + station_avg_temp_c +
                              station_diur_temp_rng_c +
                              station_max_temp_c +
                              station_min_temp_c +
                              station_precip_mm,
                            data = sj_train_labels.startweek,
                            importance = TRUE,
                            ntree = 1000)
#How many trees are needed to reach the minimum error estimate?
which.min(model_sj.rf$mse)
## [1] 665
# OOB error vs. number of trees
plot(model_sj.rf)
#Find the importance of the RF model
# Column 1 of importance() is the %IncMSE measure (available because the
# model was fit with importance = TRUE).
# NOTE(review): `optional = T` relies on the reassignable shortcut T;
# prefer spelling out TRUE.
imp <- as.data.frame(sort(importance(model_sj.rf)[,1],decreasing = TRUE),optional = T)
names(imp) <- "% Inc MSE"
imp
## % Inc MSE
## ndvi_se 37.701045
## ndvi_sw 23.312328
## reanalysis_dew_point_temp_k 13.287574
## reanalysis_specific_humidity_g_per_kg 12.960603
## ndvi_nw 12.753006
## ndvi_ne 12.341615
## station_precip_mm 12.207094
## reanalysis_min_air_temp_k 12.109047
## reanalysis_relative_humidity_percent 10.933707
## reanalysis_precip_amt_kg_per_m2 10.373380
## reanalysis_tdtr_k 8.822743
## reanalysis_max_air_temp_k 8.705044
## station_avg_temp_c 7.945887
## reanalysis_air_temp_k 7.590120
## reanalysis_sat_precip_amt_mm 7.526884
## station_min_temp_c 7.114981
## reanalysis_avg_temp_k 6.957742
## station_max_temp_c 5.571364
## precipitation_amt_mm 5.289672
## station_diur_temp_rng_c 4.998066
#graph the importance
# type = 1 plots %IncMSE, type = 2 plots node purity (see ?varImpPlot)
varImpPlot(model_sj.rf, type = 1)
varImpPlot(model_sj.rf, type = 2)
rm(model_sj.rf, imp)
library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)
#Fit a model
# Use `total_cases ~ ...` together with a named data= argument.  The old
# `iq_train_labels.startweek$total_cases ~ ...` form pulled the response
# from the global dataframe rather than from `data`, which is fragile and
# breaks formula-based tooling such as predict() on new data.
model_iq.rf <- randomForest(total_cases ~
                              ndvi_ne +
                              ndvi_nw +
                              ndvi_se +
                              ndvi_sw +
                              precipitation_amt_mm +
                              reanalysis_air_temp_k +
                              reanalysis_avg_temp_k +
                              reanalysis_dew_point_temp_k +
                              reanalysis_max_air_temp_k +
                              reanalysis_min_air_temp_k +
                              reanalysis_precip_amt_kg_per_m2 +
                              reanalysis_relative_humidity_percent +
                              reanalysis_sat_precip_amt_mm +
                              reanalysis_specific_humidity_g_per_kg +
                              reanalysis_tdtr_k + station_avg_temp_c +
                              station_diur_temp_rng_c +
                              station_max_temp_c +
                              station_min_temp_c +
                              station_precip_mm,
                            data = iq_train_labels.startweek,
                            importance = TRUE,
                            ntree = 1000)
#How many trees are needed to reach the minimum error estimate?
which.min(model_iq.rf$mse)
## [1] 880
# OOB error vs. number of trees
plot(model_iq.rf)
#Find the importance of the RF model
# Column 1 of importance() is the %IncMSE measure (available because the
# model was fit with importance = TRUE).
# NOTE(review): `optional = T` relies on the reassignable shortcut T;
# prefer spelling out TRUE.
imp <- as.data.frame(sort(importance(model_iq.rf)[,1],decreasing = TRUE),optional = T)
names(imp) <- "% Inc MSE"
imp
## % Inc MSE
## reanalysis_specific_humidity_g_per_kg 14.598346
## station_avg_temp_c 13.816486
## reanalysis_precip_amt_kg_per_m2 12.857407
## reanalysis_dew_point_temp_k 12.609141
## station_max_temp_c 12.320637
## reanalysis_relative_humidity_percent 10.737072
## reanalysis_tdtr_k 9.481949
## reanalysis_min_air_temp_k 8.021002
## station_diur_temp_rng_c 7.831725
## reanalysis_max_air_temp_k 7.098665
## reanalysis_air_temp_k 7.072691
## reanalysis_avg_temp_k 6.403049
## ndvi_ne 5.636981
## ndvi_sw 5.358412
## reanalysis_sat_precip_amt_mm 4.912512
## ndvi_se 4.140040
## precipitation_amt_mm 4.040661
## station_precip_mm 2.886239
## station_min_temp_c 2.712698
## ndvi_nw 1.982185
#graph the importance
# type = 1 plots %IncMSE, type = 2 plots node purity (see ?varImpPlot)
varImpPlot(model_iq.rf, type = 1)
varImpPlot(model_iq.rf, type = 2)
rm(model_iq.rf, imp)
#calculate the max value by year and sort by highest cases
max_cases.year <- sort(tapply(sj_train_labels.lastna$total_cases,
                              sj_train_labels.lastna$year, max),
                       decreasing = TRUE)
max_cases.year
## 1994 1998 2007 1991 1995 2005 1997 1992 1999 2001 1990 1993 2003 2000 2002
## 461 329 170 169 154 137 112 104 77 75 71 46 41 38 38
## 1996 2006 2004 2008
## 35 33 27 15
#Determine which weeks are associated to which max year values
# names() is the robust way to recover the year labels here: dimnames()
# only works while the tapply() result keeps its 1-d array dim, whereas
# names() works on any named vector.
top_years <- names(max_cases.year)
# For each of the six worst epidemic years, rank the weeks by case count.
for (i in seq_len(6)) {
  # hoist the year filter instead of recomputing it four times per pass
  in_year <- sj_train_labels.lastna$year == as.numeric(top_years[i])
  max_cases.week <- sort(tapply(sj_train_labels.lastna$total_cases[in_year],
                                sj_train_labels.lastna$weekofyear[in_year],
                                max),
                         decreasing = TRUE)
  print(top_years[i])
  print(max_cases.week[1:15])
}
## [1] "1994"
## 41 40 45 39 42 46 47 44 43 38 48 37 49 36 35
## 461 426 410 395 381 364 359 353 333 302 288 272 221 202 179
## [1] "1998"
## 32 33 31 34 35 30 36 29 27 28 41 26 40 37 51
## 329 263 256 220 204 191 181 150 128 127 127 102 102 99 89
## [1] "2007"
## 40 41 37 38 42 39 35 34 33 32 36 43 44 45 30
## 170 135 112 106 106 101 92 76 75 72 71 68 48 48 42
## [1] "1991"
## 48 42 49 40 44 45 43 47 46 41 50 39 38 51 34
## 169 142 141 140 140 140 129 129 127 116 108 92 89 78 76
## [1] "1995"
## 52 1 2 3 43 4 36 44 5 42 46 48 37 39 38
## 154 91 72 56 48 46 40 40 37 36 36 34 33 33 31
## [1] "2005"
## 35 36 33 34 37 31 32 38 30 39 29 41 42 43 44
## 137 131 126 119 112 83 82 82 75 73 56 55 55 53 46
rm(top_years, i, in_year, max_cases.week, max_cases.year)
Five peaks will be isolated for each city, taking 5 weeks on either side of the maximum yearly value. A new dataframe will be made for use in mutual information and then for prediction.
Add a binary indicator variable "peak" for logistic regression purposes (peak = 1 for peak weeks).
# San Juan outbreak peaks: an 11-week window (max week +/- 5) per peak year.
# The 1991 peak window runs past year end, so it wraps into week 1 of 1992.
sj_window <- function(yr, first_wk, last_wk) {
  sj_train_labels.lastna[sj_train_labels.lastna$year == yr &
                           sj_train_labels.lastna$weekofyear >= first_wk &
                           sj_train_labels.lastna$weekofyear <= last_wk, ]
}
sj.peak <- rbind(sj_window(1991, 43, 52),
                 sj_window(1992, 1, 1),
                 sj_window(1994, 36, 46),
                 sj_window(1998, 37, 47),
                 sj_window(2005, 30, 40),
                 sj_window(2007, 35, 45))
# Binary indicator for logistic regression: these rows are peak weeks
sj.peak$peak <- 1
rm(sj_window)
For the non-peak windows, set the same binary variable "peak" to 0 for logistic regression purposes.
# Non-peak reference windows: weeks 12-22 of five low-incidence years,
# stacked in the same order as the original per-year frames.
sj.nonpeak <- do.call(rbind, lapply(c(1997, 2001, 2003, 1993, 1996),
  function(yr) {
    sj_train_labels.lastna[sj_train_labels.lastna$year == yr &
                             sj_train_labels.lastna$weekofyear >= 12 &
                             sj_train_labels.lastna$weekofyear <= 22, ]
  }))
# Binary indicator for logistic regression: these rows are not peak weeks
sj.nonpeak$peak <- 0
# Combine peak and non-peak rows for the logistic regression fit below
sj_peak.glm <- rbind(sj.peak, sj.nonpeak)
# Fit a logistic regression model of peak vs non-peak weeks.
# FIX: use `peak ~ .` together with the data argument. The original call,
# glm(sj_peak.glm$peak ~ ., sj_peak.glm, ...), named the response via `$`,
# so the `peak` column was still swept into the `.` predictor expansion and
# the model regressed peak on itself - contributing to the
# perfect-separation warnings recorded below.
fit_glm <- glm(peak ~ .,
               data = sj_peak.glm,
               family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# generate summary (transcript below is from the original, pre-fix run)
summary(fit_glm)
##
## Call:
## glm(formula = sj_peak.glm$peak ~ ., family = "binomial", data = sj_peak.glm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.661e-05 -2.110e-08 0.000e+00 2.110e-08 1.585e-05
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) 8.124e+03 7.892e+07 0
## year -1.285e+00 2.564e+04 0
## weekofyear 5.992e-01 1.313e+04 0
## ndvi_ne 5.914e+01 1.025e+06 0
## ndvi_nw -3.671e+01 1.408e+06 0
## ndvi_se 2.318e+02 3.178e+06 0
## ndvi_sw -2.042e+02 3.702e+06 0
## precipitation_amt_mm -1.663e-01 1.923e+03 0
## reanalysis_air_temp_k -1.604e+02 2.583e+06 0
## reanalysis_avg_temp_k -4.632e+01 9.555e+05 0
## reanalysis_dew_point_temp_k 1.302e+02 2.787e+06 0
## reanalysis_max_air_temp_k -1.354e+01 1.593e+05 0
## reanalysis_min_air_temp_k -4.127e+00 2.013e+05 0
## reanalysis_precip_amt_kg_per_m2 2.574e-01 2.709e+03 0
## reanalysis_relative_humidity_percent -5.288e+01 5.464e+05 0
## reanalysis_sat_precip_amt_mm NA NA NA
## reanalysis_specific_humidity_g_per_kg 1.277e+02 1.006e+06 0
## reanalysis_tdtr_k 4.768e+00 1.680e+05 0
## station_avg_temp_c -1.567e+01 3.547e+05 0
## station_diur_temp_rng_c -1.341e+01 1.116e+05 0
## station_max_temp_c 8.618e+00 1.346e+05 0
## station_min_temp_c -1.397e+01 1.565e+05 0
## station_precip_mm -1.428e-01 2.519e+03 0
## total_cases 2.963e-01 1.900e+03 0
## Pr(>|z|)
## (Intercept) 1
## year 1
## weekofyear 1
## ndvi_ne 1
## ndvi_nw 1
## ndvi_se 1
## ndvi_sw 1
## precipitation_amt_mm 1
## reanalysis_air_temp_k 1
## reanalysis_avg_temp_k 1
## reanalysis_dew_point_temp_k 1
## reanalysis_max_air_temp_k 1
## reanalysis_min_air_temp_k 1
## reanalysis_precip_amt_kg_per_m2 1
## reanalysis_relative_humidity_percent 1
## reanalysis_sat_precip_amt_mm NA
## reanalysis_specific_humidity_g_per_kg 1
## reanalysis_tdtr_k 1
## station_avg_temp_c 1
## station_diur_temp_rng_c 1
## station_max_temp_c 1
## station_min_temp_c 1
## station_precip_mm 1
## total_cases 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.5249e+02 on 109 degrees of freedom
## Residual deviance: 1.7801e-09 on 87 degrees of freedom
## AIC: 46
##
## Number of Fisher Scoring iterations: 25
rm(sj_peak.glm, fit_glm)
This analysis looks at the mutual information for a peak prediction model. The five isolated peak windows will be used.
library(entropy)
library(infotheo)
##
## Attaching package: 'infotheo'
## The following objects are masked from 'package:entropy':
##
## discretize, entropy
# Mutual information between total_cases and each climate/vegetation column
# of the peak-week frame. Columns 5:(ncol-2) are iterated; presumably the
# leading id columns and the trailing total_cases/peak columns are skipped -
# confirm against the frame layout.
# FIX: discretize(sj.peak$total_cases) is loop-invariant; hoist it out of
# the loop instead of recomputing it on every iteration.
mu <- data.frame()
cnames <- colnames(sj.peak)
disc_cases <- discretize(sj.peak$total_cases)
for (i in 5:(ncol(sj.peak) - 2)) {
  mu[i - 4, 1] <- cnames[i]
  mu[i - 4, 2] <- mutinformation(disc_cases, discretize(sj.peak[, i]))
}
mu[order(mu$V2, decreasing = TRUE), ]
## V1 V2
## 1 ndvi_se 0.22008822
## 15 station_diur_temp_rng_c 0.10178915
## 6 reanalysis_dew_point_temp_k 0.08947514
## 12 reanalysis_specific_humidity_g_per_kg 0.07098685
## 7 reanalysis_max_air_temp_k 0.06446396
## 18 station_precip_mm 0.05977442
## 8 reanalysis_min_air_temp_k 0.05513424
## 4 reanalysis_air_temp_k 0.05412362
## 17 station_min_temp_c 0.05227911
## 5 reanalysis_avg_temp_k 0.04437999
## 13 reanalysis_tdtr_k 0.04282410
## 10 reanalysis_relative_humidity_percent 0.03870491
## 9 reanalysis_precip_amt_kg_per_m2 0.03184109
## 16 station_max_temp_c 0.03036126
## 2 ndvi_sw 0.02205445
## 14 station_avg_temp_c 0.02192181
## 3 precipitation_amt_mm 0.01964312
## 11 reanalysis_sat_precip_amt_mm 0.01964312
rm(mu, cnames, i, disc_cases)
library(entropy)
library(infotheo)
# Same mutual-information scan as above, over the non-peak reference weeks.
# FIX: the discretized total_cases is loop-invariant; hoist it.
mu <- data.frame()
cnames <- colnames(sj.nonpeak)
disc_cases <- discretize(sj.nonpeak$total_cases)
for (i in 5:(ncol(sj.nonpeak) - 2)) {
  mu[i - 4, 1] <- cnames[i]
  mu[i - 4, 2] <- mutinformation(disc_cases, discretize(sj.nonpeak[, i]))
}
mu[order(mu$V2, decreasing = TRUE), ]
## V1 V2
## 1 ndvi_se 0.09648317
## 5 reanalysis_avg_temp_k 0.06407748
## 4 reanalysis_air_temp_k 0.06179880
## 2 ndvi_sw 0.05977442
## 9 reanalysis_precip_amt_kg_per_m2 0.04481389
## 8 reanalysis_min_air_temp_k 0.04417511
## 10 reanalysis_relative_humidity_percent 0.04358990
## 18 station_precip_mm 0.03305411
## 17 station_min_temp_c 0.02549627
## 14 station_avg_temp_c 0.02374231
## 13 reanalysis_tdtr_k 0.02148998
## 7 reanalysis_max_air_temp_k 0.01964312
## 15 station_diur_temp_rng_c 0.01804743
## 3 precipitation_amt_mm 0.01703681
## 11 reanalysis_sat_precip_amt_mm 0.01703681
## 16 station_max_temp_c 0.01473814
## 12 reanalysis_specific_humidity_g_per_kg 0.01138601
## 6 reanalysis_dew_point_temp_k 0.01076950
rm(mu, cnames, i, disc_cases)
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
sj_peak.nb <- rbind(sj.peak, sj.nonpeak)
# Fit a Naive Bayes model for total_cases over the peak/non-peak rows.
# FIX: reference the response by bare column name. The original,
# naiveBayes(sj_peak.nb$total_cases ~ ., sj_peak.nb), left the total_cases
# column inside the `.` expansion, so the model conditioned on its own
# response variable.
fit_nb <- naiveBayes(total_cases ~ ., data = sj_peak.nb)
# generate summary
summary(fit_nb)
## Length Class Mode
## apriori 62 table numeric
## tables 23 -none- list
## levels 0 -none- NULL
## call 4 -none- call
#remove fit_nb and the intermediate frames
rm(fit_nb, sj_peak.nb)
rm(sj.peak, sj.nonpeak)
The following predictive models will review the Root Mean Square Error for each city. This is done without feature selection and with missing values imputed using the last non-NA values.
set.seed(136)
# Randomly pick 80% of the observations for training.
# seq_len() replaces the 1:nrow(...) anti-pattern (identical RNG draw).
index.sj <- sample(seq_len(nrow(sj_train_labels.startweek)),
                   size = 0.8 * nrow(sj_train_labels.startweek))
# subset train_labels to include only the elements in the index
train.sj <- sj_train_labels.startweek[index.sj, ]
# subset train_labels to include all but the elements in the index
validation.sj <- sj_train_labels.startweek[-index.sj, ]
nrow(train.sj)
## [1] 748
nrow(validation.sj)
## [1] 188
# # Create a dataframe with train and test indicator...
# group <- rep(NA,nrow(sj_train_labels.startweek))
#
# group <- ifelse(seq(1,nrow(sj_train_labels.startweek)) %in% index,"Train","Validation")
#
# df <- data.frame(date=sj_train_labels.startweek$week_start_date,cases=sj_train_labels.startweek$total_cases,group)
#
# # ...and plot it
# ggplot(df,aes(x = date,y = cases, color = group)) + geom_point() +
# scale_color_discrete(name="") + theme(legend.position="top")
rm(index.sj)
set.seed(136)
# Randomly pick 80% of the observations for training (Iquitos).
# seq_len() replaces the 1:nrow(...) anti-pattern (identical RNG draw).
index.iq <- sample(seq_len(nrow(iq_train_labels.startweek)),
                   size = 0.8 * nrow(iq_train_labels.startweek))
# subset train_labels to include only the elements in the index
train.iq <- iq_train_labels.startweek[index.iq, ]
# subset train_labels to include all but the elements in the index
validation.iq <- iq_train_labels.startweek[-index.iq, ]
nrow(train.iq)
## [1] 416
nrow(validation.iq)
## [1] 104
rm(index.iq)
The baseline model shifts the total_cases down by one so that each value falls to the next week. The difference between the original and the shifted values is taken, and the RMSE is used as the metric to measure performance.
# Baseline 1 (San Juan): a persistence forecast - predict this week's cases
# with last week's observed cases.
sj_train_labels.shift <- sj_train_labels.lastna
# Lag total_cases by one week; the first week has no predecessor, so use 0
sj_train_labels.shift$total_cases2 <-
  c(0, head(sj_train_labels.shift$total_cases, -1))
# Weekly forecast error (lagged minus observed)
sj_train_labels.shift$diff <-
  sj_train_labels.shift$total_cases2 - sj_train_labels.shift$total_cases
# Evaluate RMSE and MAE over the full series
RMSE.SJ.baseline1 <- sqrt(mean((sj_train_labels.shift$diff)^2))
RMSE.SJ.baseline1
## [1] 15.34033
MAE.SJ.baseline1 <- mean(abs(sj_train_labels.shift$diff))
MAE.SJ.baseline1
## [1] 8.394231
rm(sj_train_labels.shift)
The baseline model shifts the total_cases down by one so that each value falls to the next week. The difference between the original and the shifted values is taken, and the RMSE is used as the metric to measure performance.
# Baseline 1 (Iquitos): a persistence forecast - predict this week's cases
# with last week's observed cases.
iq_train_labels.shift <- iq_train_labels.lastna
# Lag total_cases by one week; the first week has no predecessor, so use 0
iq_train_labels.shift$total_cases2 <-
  c(0, head(iq_train_labels.shift$total_cases, -1))
# Weekly forecast error (lagged minus observed)
iq_train_labels.shift$diff <-
  iq_train_labels.shift$total_cases2 - iq_train_labels.shift$total_cases
# Evaluate RMSE and MAE over the full series
RMSE.IQ.baseline1 <- sqrt(mean((iq_train_labels.shift$diff)^2))
RMSE.IQ.baseline1
## [1] 7.660086
MAE.IQ.baseline1 <- mean(abs(iq_train_labels.shift$diff))
MAE.IQ.baseline1
## [1] 3.930769
rm(iq_train_labels.shift)
library(ggplot2)
# Baseline 2 (San Juan): predict the training-set mean for every week
best.guess.sj <- mean(train.sj$total_cases)
# Evaluate RMSE and MAE on the validation data
RMSE.SJ.baseline2 <- sqrt(mean((validation.sj$total_cases - best.guess.sj)^2))
RMSE.SJ.baseline2
## [1] 44.41748
MAE.SJ.baseline2 <- mean(abs(validation.sj$total_cases - best.guess.sj))
MAE.SJ.baseline2
## [1] 27.03268
rm(best.guess.sj)
library(ggplot2)
# Baseline 2 (Iquitos): predict the training-set mean for every week
best.guess.iq <- mean(train.iq$total_cases)
# Evaluate RMSE and MAE on the validation data
RMSE.IQ.baseline2 <- sqrt(mean((validation.iq$total_cases - best.guess.iq)^2))
RMSE.IQ.baseline2
## [1] 11.6964
MAE.IQ.baseline2 <- mean(abs(validation.iq$total_cases - best.guess.iq))
MAE.IQ.baseline2
## [1] 7.383275
rm(best.guess.iq)
library(MASS)
library(reshape2)
library(ggplot2)
# --- Negative Binomial Regression, San Juan ---
# Determine the dispersion of total_cases.
# FIX: the original wrote round(with(df, mean(total_cases), 2)) - the 2 was
# swallowed as an unused argument of with(), so round() defaulted to 0 digits.
# The digits argument now goes to round() as intended.
round(with(sj_train_labels.startweek, mean(total_cases)), 2)
## [1] 34
round(with(sj_train_labels.startweek, var(total_cases)), 2)
## [1] 2640
#As there is over-dispersion of total_cases (variance is greater than the mean) we can go ahead and build the NBR model
#Build the model
# NOTE(review): columns 4:24 are presumably the predictors plus total_cases
# (dropping the id/date columns) - confirm against the frame layout. The
# summary below reports one coefficient "not defined because of singularities"
# (reanalysis_sat_precip_amt_mm), which causes the rank-deficient-fit warning
# at predict() time.
model_sj.nbr <- glm.nb(formula = total_cases ~ ., data = train.sj[,4:24])
summary(model_sj.nbr)
##
## Call:
## glm.nb(formula = total_cases ~ ., data = train.sj[, 4:24], init.theta = 1.238752535,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6005 -1.0132 -0.4303 0.2511 3.7686
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -36.095981 35.906521 -1.005
## ndvi_ne -0.075816 0.439385 -0.173
## ndvi_nw 1.745488 0.506357 3.447
## ndvi_se -6.437402 0.998317 -6.448
## ndvi_sw 5.462943 1.033934 5.284
## precipitation_amt_mm -0.002296 0.001069 -2.148
## reanalysis_air_temp_k 2.859593 1.787091 1.600
## reanalysis_avg_temp_k -0.882312 0.449115 -1.965
## reanalysis_dew_point_temp_k -2.793170 1.662257 -1.680
## reanalysis_max_air_temp_k 0.301373 0.106441 2.831
## reanalysis_min_air_temp_k -0.052397 0.112896 -0.464
## reanalysis_precip_amt_kg_per_m2 0.001093 0.001273 0.858
## reanalysis_relative_humidity_percent 0.400739 0.369438 1.085
## reanalysis_sat_precip_amt_mm NA NA NA
## reanalysis_specific_humidity_g_per_kg 1.028983 0.466649 2.205
## reanalysis_tdtr_k -0.608646 0.118727 -5.126
## station_avg_temp_c -0.222566 0.114877 -1.937
## station_diur_temp_rng_c 0.055151 0.071591 0.770
## station_max_temp_c 0.041047 0.057318 0.716
## station_min_temp_c -0.007992 0.070262 -0.114
## station_precip_mm -0.001965 0.001690 -1.163
## Pr(>|z|)
## (Intercept) 0.314764
## ndvi_ne 0.863005
## ndvi_nw 0.000567 ***
## ndvi_se 1.13e-10 ***
## ndvi_sw 1.27e-07 ***
## precipitation_amt_mm 0.031687 *
## reanalysis_air_temp_k 0.109568
## reanalysis_avg_temp_k 0.049465 *
## reanalysis_dew_point_temp_k 0.092890 .
## reanalysis_max_air_temp_k 0.004635 **
## reanalysis_min_air_temp_k 0.642565
## reanalysis_precip_amt_kg_per_m2 0.390702
## reanalysis_relative_humidity_percent 0.278043
## reanalysis_sat_precip_amt_mm NA
## reanalysis_specific_humidity_g_per_kg 0.027451 *
## reanalysis_tdtr_k 2.95e-07 ***
## station_avg_temp_c 0.052694 .
## station_diur_temp_rng_c 0.441080
## station_max_temp_c 0.473911
## station_min_temp_c 0.909437
## station_precip_mm 0.244969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.2388) family taken to be 1)
##
## Null deviance: 1085.5 on 747 degrees of freedom
## Residual deviance: 827.3 on 728 degrees of freedom
## AIC: 6640.9
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.2388
## Std. Err.: 0.0614
##
## 2 x log-likelihood: -6598.8710
# Predict expected case counts on the hold-out weeks
prediction_sj.nbr <- predict(model_sj.nbr, validation.sj, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#Plot the prediction for NBR
df_prediction_sj.nbr <- data.frame('prediction' = prediction_sj.nbr,
'actual' = validation.sj$total_cases,
'time' = validation.sj$week_start_date)
df_prediction_sj.nbr <- melt(df_prediction_sj.nbr, id.vars = 'time')
ggplot(df_prediction_sj.nbr, aes(x = time, y = value, color = variable)) +
geom_line() +
ggtitle('NBR: Dengue predicted Cases vs. Actual Cases (City-San Juan) ')
# Evaluate RMSE and MAE on the validation data
RMSE.SJ.nbr <- sqrt(mean((prediction_sj.nbr-validation.sj$total_cases)^2))
RMSE.SJ.nbr
## [1] 42.50416
MAE.SJ.nbr <- mean(abs(prediction_sj.nbr-validation.sj$total_cases))
MAE.SJ.nbr
## [1] 26.15555
rm(df_prediction_sj.nbr, model_sj.nbr, prediction_sj.nbr)
library(MASS)
library(reshape2)
library(ggplot2)
# --- Negative Binomial Regression, Iquitos ---
# Determine the dispersion of total_cases.
# FIX: as in the San Juan block, the digits argument was being passed to
# with() (where it is ignored) instead of round(); it now goes to round().
round(with(iq_train_labels.startweek, mean(total_cases)), 2)
## [1] 8
round(with(iq_train_labels.startweek, var(total_cases)), 2)
## [1] 116
#As there is over-dispersion of total_cases (variance is greater than the mean) we can go ahead and build the NBR model
#Build the model
# NOTE(review): columns 4:24 are presumably the predictors plus total_cases -
# confirm against the frame layout. One coefficient is aliased
# (reanalysis_sat_precip_amt_mm), hence the rank-deficient-fit warning below.
model_iq.nbr <- glm.nb(formula = total_cases ~ ., data = train.iq[,4:24])
summary(model_iq.nbr)
##
## Call:
## glm.nb(formula = total_cases ~ ., data = train.iq[, 4:24], init.theta = 0.8927711016,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3298 -1.1448 -0.3802 0.3018 3.6056
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -3.9555345 14.4163166 -0.274
## ndvi_ne 2.0225152 1.5012052 1.347
## ndvi_nw 1.3923607 1.2471222 1.116
## ndvi_se -4.5143238 1.1676946 -3.866
## ndvi_sw -0.3853344 1.3088897 -0.294
## precipitation_amt_mm -0.0004742 0.0019193 -0.247
## reanalysis_air_temp_k 0.3821218 0.6047721 0.632
## reanalysis_avg_temp_k -0.1713872 0.2741650 -0.625
## reanalysis_dew_point_temp_k -1.4903126 0.8783992 -1.697
## reanalysis_max_air_temp_k -0.0937201 0.0551876 -1.698
## reanalysis_min_air_temp_k 0.0119852 0.0798743 0.150
## reanalysis_precip_amt_kg_per_m2 -0.0023324 0.0014441 -1.615
## reanalysis_relative_humidity_percent 0.0330521 0.1314110 0.252
## reanalysis_sat_precip_amt_mm NA NA NA
## reanalysis_specific_humidity_g_per_kg 1.5909235 0.7529229 2.113
## reanalysis_tdtr_k 0.0354126 0.0897535 0.395
## station_avg_temp_c 0.0127747 0.1083638 0.118
## station_diur_temp_rng_c 0.0277937 0.0668043 0.416
## station_max_temp_c 0.1462946 0.0704404 2.077
## station_min_temp_c 0.0706209 0.0738444 0.956
## station_precip_mm 0.0009194 0.0009481 0.970
## Pr(>|z|)
## (Intercept) 0.783793
## ndvi_ne 0.177896
## ndvi_nw 0.264226
## ndvi_se 0.000111 ***
## ndvi_sw 0.768454
## precipitation_amt_mm 0.804843
## reanalysis_air_temp_k 0.527489
## reanalysis_avg_temp_k 0.531889
## reanalysis_dew_point_temp_k 0.089768 .
## reanalysis_max_air_temp_k 0.089468 .
## reanalysis_min_air_temp_k 0.880725
## reanalysis_precip_amt_kg_per_m2 0.106282
## reanalysis_relative_humidity_percent 0.801414
## reanalysis_sat_precip_amt_mm NA
## reanalysis_specific_humidity_g_per_kg 0.034601 *
## reanalysis_tdtr_k 0.693172
## station_avg_temp_c 0.906157
## station_diur_temp_rng_c 0.677376
## station_max_temp_c 0.037815 *
## station_min_temp_c 0.338897
## station_precip_mm 0.332217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.8928) family taken to be 1)
##
## Null deviance: 561.45 on 415 degrees of freedom
## Residual deviance: 474.29 on 396 degrees of freedom
## AIC: 2495.4
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.8928
## Std. Err.: 0.0737
##
## 2 x log-likelihood: -2453.4450
# Predict expected case counts on the hold-out weeks
prediction_iq.nbr <- predict(model_iq.nbr, validation.iq, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
#Plot the prediction for NBR
df_prediction_iq.nbr <- data.frame('prediction' = prediction_iq.nbr,
'actual' = validation.iq$total_cases,
'time' = validation.iq$week_start_date)
df_prediction_iq.nbr <- melt(df_prediction_iq.nbr, id.vars = 'time')
ggplot(df_prediction_iq.nbr, aes(x = time, y = value, color = variable)) +
geom_line() +
ggtitle('NBR: Dengue predicted Cases vs. Actual Cases (City-IQUITOS) ')
# Evaluate RMSE and MAE on the validation data
RMSE.IQ.nbr <- sqrt(mean((prediction_iq.nbr-validation.iq$total_cases)^2))
RMSE.IQ.nbr
## [1] 11.934
MAE.IQ.nbr <- mean(abs(prediction_iq.nbr-validation.iq$total_cases))
MAE.IQ.nbr
## [1] 7.742887
rm(df_prediction_iq.nbr, model_iq.nbr, prediction_iq.nbr)
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.4.4
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
## The following object is masked from 'package:psych':
##
## alpha
library(reshape2)
library(ggplot2)
# Fit a linear-kernel support vector machine for San Juan
model_sj.svm <- ksvm(total_cases ~ ., data = train.sj, kernel = "vanilladot")
## Setting default kernel parameters
# Predict case counts for the hold-out weeks
prediction_sj.svm <- predict(model_sj.svm, validation.sj)
# Long-format frame of predicted and actual series, keyed by week, for ggplot
df_prediction_sj.svm <- melt(
  data.frame(prediction = prediction_sj.svm,
             actual = validation.sj$total_cases,
             time = validation.sj$week_start_date),
  id.vars = 'time')
ggplot(df_prediction_sj.svm, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('SVM: Dengue predicted Cases vs. Actual Cases (City-San Juan) ')
# Hold-out error metrics
RMSE.SJ.svm <- sqrt(mean((prediction_sj.svm - validation.sj$total_cases)^2))
RMSE.SJ.svm
## [1] 42.23424
MAE.SJ.svm <- mean(abs(prediction_sj.svm - validation.sj$total_cases))
MAE.SJ.svm
## [1] 19.76721
rm(df_prediction_sj.svm, model_sj.svm, prediction_sj.svm)
library(kernlab)
library(reshape2)
library(ggplot2)
# Fit a linear-kernel support vector machine for Iquitos
model_iq.svm <- ksvm(total_cases ~ ., data = train.iq, kernel = "vanilladot")
## Setting default kernel parameters
# Predict case counts for the hold-out weeks
prediction_iq.svm <- predict(model_iq.svm, validation.iq)
# Long-format frame of predicted and actual series, keyed by week, for ggplot
df_prediction_iq.svm <- melt(
  data.frame(prediction = prediction_iq.svm,
             actual = validation.iq$total_cases,
             time = validation.iq$week_start_date),
  id.vars = 'time')
ggplot(df_prediction_iq.svm, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('SVM: Dengue predicted Cases vs. Actual Cases (City-Iquitos) ')
# Hold-out error metrics
RMSE.IQ.svm <- sqrt(mean((prediction_iq.svm - validation.iq$total_cases)^2))
RMSE.IQ.svm
## [1] 11.95082
MAE.IQ.svm <- mean(abs(prediction_iq.svm - validation.iq$total_cases))
MAE.IQ.svm
## [1] 6.398152
rm(df_prediction_iq.svm, model_iq.svm, prediction_iq.svm)
library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)
set.seed(136)
# Random forest on the San Juan feature columns (default 500 trees)
model_sj.rf <- randomForest(formula = total_cases ~ ., data = train.sj[,4:24])
model_sj.rf
##
## Call:
## randomForest(formula = total_cases ~ ., data = train.sj[, 4:24])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 1444.554
## % Var explained: 48.49
# Predict case counts for the hold-out weeks
prediction_sj.rf <- predict(model_sj.rf, validation.sj, type = 'response')
# Long-format frame of predicted and actual series, keyed by week, for ggplot
df_prediction_sj.rf <- melt(
  data.frame(prediction = prediction_sj.rf,
             actual = validation.sj$total_cases,
             time = validation.sj$week_start_date),
  id.vars = 'time')
ggplot(df_prediction_sj.rf, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('RF: Dengue predicted Cases vs. Actual Cases (City-San Juan) ')
# Hold-out error metrics
RMSE.SJ.rf <- sqrt(mean((prediction_sj.rf - validation.sj$total_cases)^2))
RMSE.SJ.rf
## [1] 34.51898
MAE.SJ.rf <- mean(abs(prediction_sj.rf - validation.sj$total_cases))
MAE.SJ.rf
## [1] 24.67019
rm(df_prediction_sj.rf, model_sj.rf, prediction_sj.rf)
library(reshape2)
library(ggplot2)
library(randomForest)
library(caret)
set.seed(136)
# Random forest on the Iquitos feature columns (default 500 trees)
model_iq.rf <- randomForest(formula = total_cases ~ ., data = train.iq[,4:24])
model_iq.rf
##
## Call:
## randomForest(formula = total_cases ~ ., data = train.iq[, 4:24])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 106.1351
## % Var explained: 3.88
# Predict case counts for the hold-out weeks
prediction_iq.rf <- predict(model_iq.rf, validation.iq, type = 'response')
# Long-format frame of predicted and actual series, keyed by week, for ggplot
df_prediction_iq.rf <- melt(
  data.frame(prediction = prediction_iq.rf,
             actual = validation.iq$total_cases,
             time = validation.iq$week_start_date),
  id.vars = 'time')
ggplot(df_prediction_iq.rf, aes(x = time, y = value, color = variable)) +
  geom_line() +
  ggtitle('RF: Dengue predicted Cases vs. Actual Cases (City-IQUITOS) ')
# Hold-out error metrics
RMSE.IQ.rf <- sqrt(mean((prediction_iq.rf - validation.iq$total_cases)^2))
RMSE.IQ.rf
## [1] 11.67287
MAE.IQ.rf <- mean(abs(prediction_iq.rf - validation.iq$total_cases))
MAE.IQ.rf
## [1] 7.544626
rm(df_prediction_iq.rf, model_iq.rf, prediction_iq.rf)
# Create a data frame with the error metrics for each method (San Juan).
# FIX: the RMSE/MAE vectors were ordered baseline1, baseline2, SVM, NBR, RF
# while the Method labels read "NB Regression" before "SVM", so those two
# rows were swapped in the printed table. The vectors now follow label order.
accuracy <- data.frame(Method = c("Baseline1",
                                  "Baseline2",
                                  "NB Regression",
                                  "SVM",
                                  "Random forest"),
                       RMSE = c(RMSE.SJ.baseline1,
                                RMSE.SJ.baseline2,
                                RMSE.SJ.nbr,
                                RMSE.SJ.svm,
                                RMSE.SJ.rf),
                       MAE = c(MAE.SJ.baseline1,
                               MAE.SJ.baseline2,
                               MAE.SJ.nbr,
                               MAE.SJ.svm,
                               MAE.SJ.rf))
# Round the values and print the table
accuracy$RMSE <- round(accuracy$RMSE, 2)
accuracy$MAE <- round(accuracy$MAE, 2)
accuracy
## (corrected table)
## Method RMSE MAE
## 1 Baseline1 15.34 8.39
## 2 Baseline2 44.42 27.03
## 3 NB Regression 42.50 26.16
## 4 SVM 42.23 19.77
## 5 Random forest 34.52 24.67
rm(accuracy,
   RMSE.SJ.baseline1,
   RMSE.SJ.baseline2,
   RMSE.SJ.svm,
   RMSE.SJ.nbr,
   RMSE.SJ.rf,
   MAE.SJ.baseline1,
   MAE.SJ.baseline2,
   MAE.SJ.svm,
   MAE.SJ.nbr,
   MAE.SJ.rf)
# Create a data frame with the error metrics for each method (Iquitos).
# FIX: same label/value mismatch as the San Juan table - the RMSE/MAE vectors
# were ordered SVM-then-NBR against labels ordered NBR-then-SVM. The vectors
# now follow the label order.
accuracy <- data.frame(Method = c("Baseline1",
                                  "Baseline2",
                                  "NB Regression",
                                  "SVM",
                                  "Random forest"),
                       RMSE = c(RMSE.IQ.baseline1,
                                RMSE.IQ.baseline2,
                                RMSE.IQ.nbr,
                                RMSE.IQ.svm,
                                RMSE.IQ.rf),
                       MAE = c(MAE.IQ.baseline1,
                               MAE.IQ.baseline2,
                               MAE.IQ.nbr,
                               MAE.IQ.svm,
                               MAE.IQ.rf))
# Round the values and print the table
accuracy$RMSE <- round(accuracy$RMSE, 2)
accuracy$MAE <- round(accuracy$MAE, 2)
accuracy
## (corrected table)
## Method RMSE MAE
## 1 Baseline1 7.66 3.93
## 2 Baseline2 11.70 7.38
## 3 NB Regression 11.93 7.74
## 4 SVM 11.95 6.40
## 5 Random forest 11.67 7.54
rm(accuracy,
   RMSE.IQ.baseline1,
   RMSE.IQ.baseline2,
   RMSE.IQ.svm,
   RMSE.IQ.nbr,
   RMSE.IQ.rf,
   MAE.IQ.baseline1,
   MAE.IQ.baseline2,
   MAE.IQ.svm,
   MAE.IQ.nbr,
   MAE.IQ.rf)
rm(train.iq, train.sj, validation.iq, validation.sj)
library(caret)
set.seed(136)
# Candidate learners to compare for San Juan under repeated cross-validation
methods <- c("nnet", "rf", "mlp", "rpart", "svmLinear", "svmRadial", "neuralnet")
performetrics <- data.frame()
# 10-fold CV repeated 3 times
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# seq_along() replaces the 1:length(...) anti-pattern
for (i in seq_along(methods)) {
  # Train candidate i.
  # NOTE(review): columns 3:23 presumably drop the leading id columns -
  # confirm against the frame layout.
  model_sj.cv <- train(total_cases ~ .,
                       data = sj_train_labels.lastna[3:23],
                       method = methods[i],
                       trControl = control)
  # Record the best (minimum) MAE and RMSE found across the tuning grid
  performetrics[i, 1] <- methods[i]
  performetrics[i, 2] <- min(model_sj.cv$results["MAE"])
  performetrics[i, 3] <- min(model_sj.cv$results["RMSE"])
}
## # weights: 23
## initial value 3209630.731535
## final value 3172060.000000
## converged
## # weights: 67
## initial value 3197802.688580
## final value 3172060.000000
## converged
## # weights: 111
## initial value 3183375.193811
## final value 3172060.000000
## converged
## # weights: 23
## initial value 3200281.995395
## iter 10 value 3172239.909532
## iter 20 value 3172067.096499
## iter 20 value 3172067.067843
## iter 20 value 3172067.067843
## final value 3172067.067843
## converged
## # weights: 67
## initial value 3187166.247585
## iter 10 value 3172108.357417
## iter 20 value 3172064.045624
## final value 3172063.881035
## converged
## # weights: 111
## initial value 3199405.073932
## iter 10 value 3172404.826908
## iter 20 value 3172063.103337
## final value 3172062.756619
## converged
## # weights: 23
## initial value 3195874.345250
## iter 10 value 3172093.846509
## iter 20 value 3172060.396243
## final value 3172060.047947
## converged
## # weights: 67
## initial value 3203956.755399
## iter 10 value 3172229.741711
## iter 20 value 3172061.971888
## final value 3172060.070071
## converged
## # weights: 111
## initial value 3194486.476217
## iter 10 value 3172168.991766
## iter 20 value 3172061.283520
## final value 3172060.082382
## converged
## # weights: 23
## initial value 3249044.736232
## final value 3223549.000000
## converged
## # weights: 67
## initial value 3245209.784130
## final value 3223549.000000
## converged
## # weights: 111
## initial value 3248865.230244
## final value 3223549.000000
## converged
## # weights: 23
## initial value 3240170.913117
## iter 10 value 3223559.483260
## final value 3223555.987962
## converged
## # weights: 67
## initial value 3258653.147534
## iter 10 value 3224044.133102
## iter 20 value 3223566.137988
## iter 30 value 3223553.348355
## final value 3223553.185035
## converged
## # weights: 111
## initial value 3235776.207773
## iter 10 value 3223926.272917
## iter 20 value 3223552.906327
## final value 3223551.758444
## converged
## # weights: 23
## initial value 3247952.126468
## iter 10 value 3223583.367202
## iter 20 value 3223549.400897
## final value 3223549.047231
## converged
## # weights: 67
## initial value 3254265.634668
## iter 10 value 3223695.257227
## iter 20 value 3223550.705577
## final value 3223549.126293
## converged
## # weights: 111
## initial value 3256303.414416
## final value 3223557.972553
## converged
## # weights: 23
## initial value 3110247.550726
## final value 3076043.000000
## converged
## # weights: 67
## initial value 3112799.499910
## final value 3076043.000000
## converged
## # weights: 111
## initial value 3096369.196755
## final value 3076043.000000
## converged
## # weights: 23
## initial value 3103212.772398
## iter 10 value 3076089.061521
## final value 3076049.962419
## converged
## # weights: 67
## initial value 3099748.349269
## iter 10 value 3076049.937682
## iter 20 value 3076047.294181
## final value 3076046.903708
## converged
## # weights: 111
## initial value 3097708.039027
## iter 10 value 3076423.573483
## iter 20 value 3076046.099232
## final value 3076045.779487
## converged
## # weights: 23
## initial value 3097194.914019
## iter 10 value 3076073.788010
## iter 20 value 3076043.359675
## final value 3076043.042846
## converged
## # weights: 67
## initial value 3096728.038765
## iter 10 value 3076080.844093
## iter 20 value 3076043.449343
## final value 3076043.080566
## converged
## # weights: 111
## initial value 3088181.780633
## iter 10 value 3076124.348723
## iter 20 value 3076043.966954
## final value 3076043.070526
## converged
## # weights: 23
## initial value 3348836.239697
## final value 3333492.000000
## converged
## # weights: 67
## initial value 3363172.155721
## final value 3333492.000000
## converged
## # weights: 111
## initial value 3372505.034740
## final value 3333492.000000
## converged
## # weights: 23
## initial value 3358434.729853
## iter 10 value 3333553.348634
## iter 20 value 3333499.046939
## iter 20 value 3333499.032860
## iter 20 value 3333499.032135
## final value 3333499.032135
## converged
## # weights: 67
## initial value 3373530.954683
## iter 10 value 3333505.648196
## final value 3333497.620753
## converged
## # weights: 111
## initial value 3369647.092954
## iter 10 value 3333900.074205
## iter 20 value 3333495.395439
## final value 3333495.134774
## converged
## # weights: 23
## initial value 3357839.571697
## iter 10 value 3333529.261629
## iter 20 value 3333492.435832
## final value 3333492.052390
## converged
## # weights: 67
## initial value 3348865.620035
## iter 10 value 3333559.300594
## iter 20 value 3333492.793479
## final value 3333492.099765
## converged
## # weights: 111
## initial value 3365216.613903
## iter 10 value 3333693.698045
## iter 20 value 3333494.353151
## final value 3333492.166220
## converged
## # weights: 23
## initial value 3141073.838283
## final value 3120988.000000
## converged
## # weights: 67
## initial value 3148026.329035
## final value 3120988.000000
## converged
## # weights: 111
## initial value 3157828.233653
## final value 3120988.000000
## converged
## # weights: 23
## initial value 3144428.292469
## iter 10 value 3121157.094096
## final value 3120994.972953
## converged
## # weights: 67
## initial value 3154261.526797
## iter 10 value 3121039.472453
## iter 20 value 3120992.969465
## iter 20 value 3120992.956099
## iter 20 value 3120992.956099
## final value 3120992.956099
## converged
## # weights: 111
## initial value 3151745.614646
## iter 10 value 3121061.078223
## iter 20 value 3120991.425254
## final value 3120991.209412
## converged
## # weights: 23
## initial value 3150557.807793
## iter 10 value 3121023.331498
## iter 20 value 3120988.412241
## final value 3120988.048652
## converged
## # weights: 67
## initial value 3140136.926537
## iter 10 value 3121100.484013
## iter 20 value 3120989.316104
## final value 3120988.094238
## converged
## # weights: 111
## initial value 3154361.325150
## iter 10 value 3121153.346111
## iter 20 value 3120989.935360
## final value 3120988.111752
## converged
## # weights: 23
## initial value 3079870.099543
## final value 3054256.000000
## converged
## # weights: 67
## initial value 3077001.574388
## final value 3054256.000000
## converged
## # weights: 111
## initial value 3095480.288624
## final value 3054256.000000
## converged
## # weights: 23
## initial value 3075523.687495
## iter 10 value 3054265.374457
## final value 3054262.982702
## converged
## # weights: 67
## initial value 3078378.358658
## iter 10 value 3054290.763960
## iter 20 value 3054260.041464
## iter 20 value 3054260.022610
## final value 3054259.920833
## converged
## # weights: 111
## initial value 3091312.372691
## iter 10 value 3054470.241965
## final value 3054259.355974
## converged
## # weights: 23
## initial value 3086639.516313
## iter 10 value 3054392.134231
## iter 20 value 3054257.575409
## final value 3054256.069525
## converged
## # weights: 67
## initial value 3077235.766942
## iter 10 value 3054311.470741
## iter 20 value 3054256.657345
## final value 3054256.061881
## converged
## # weights: 111
## initial value 3085408.103725
## final value 3054258.646258
## converged
## # weights: 23
## initial value 3223749.640386
## final value 3190550.000000
## converged
## # weights: 67
## initial value 3214577.605914
## final value 3190550.000000
## converged
## # weights: 111
## initial value 3233920.941134
## final value 3190550.000000
## converged
## # weights: 23
## initial value 3227661.094225
## iter 10 value 3191737.286454
## iter 20 value 3190564.820660
## final value 3190556.975841
## converged
## # weights: 67
## initial value 3231694.536349
## iter 10 value 3190598.618556
## iter 20 value 3190555.689146
## final value 3190553.929168
## converged
## # weights: 111
## initial value 3218640.119559
## iter 10 value 3190751.630465
## iter 20 value 3190553.178515
## final value 3190552.777677
## converged
## # weights: 23
## initial value 3215363.685906
## iter 10 value 3190584.405900
## iter 20 value 3190550.401525
## final value 3190550.047462
## converged
## # weights: 67
## initial value 3228262.599575
## iter 10 value 3190648.388881
## iter 20 value 3190551.154530
## final value 3190550.072029
## converged
## # weights: 111
## initial value 3223132.848869
## iter 10 value 3190590.742391
## iter 20 value 3190550.492512
## final value 3190550.073378
## converged
## # weights: 23
## initial value 3300486.542786
## final value 3259705.000000
## converged
## # weights: 67
## initial value 3279683.018840
## final value 3259705.000000
## converged
## # weights: 111
## initial value 3270456.467898
## final value 3259705.000000
## converged
## # weights: 23
## initial value 3282122.639608
## iter 10 value 3259720.244123
## final value 3259717.495917
## converged
## # weights: 67
## initial value 3286047.526814
## iter 10 value 3259857.964682
## iter 20 value 3259709.226134
## final value 3259708.932108
## converged
## # weights: 111
## initial value 3275153.913687
## iter 10 value 3259873.163928
## iter 20 value 3259708.252198
## iter 20 value 3259708.226259
## iter 20 value 3259708.221202
## final value 3259708.221202
## converged
## # weights: 23
## initial value 3287653.303985
## iter 10 value 3259778.767736
## iter 20 value 3259705.856807
## final value 3259705.063864
## converged
## # weights: 67
## initial value 3285087.257291
## iter 10 value 3259822.897615
## iter 20 value 3259706.377833
## final value 3259705.073785
## converged
## # weights: 111
## initial value 3278655.780170
## final value 3259705.040746
## converged
## # weights: 23
## initial value 3015949.565057
## final value 2993143.000000
## converged
## # weights: 67
## initial value 3024603.134634
## final value 2993143.000000
## converged
## # weights: 111
## initial value 3021205.493431
## final value 2993143.000000
## converged
## # weights: 23
## initial value 3017800.626752
## iter 10 value 2993240.542391
## iter 20 value 2993150.342436
## final value 2993149.959774
## converged
## # weights: 67
## initial value 3014417.043287
## iter 10 value 2993169.289412
## iter 20 value 2993148.131758
## final value 2993146.901342
## converged
## # weights: 111
## initial value 3014060.556502
## iter 10 value 2993155.293509
## iter 20 value 2993146.422220
## final value 2993145.821403
## converged
## # weights: 23
## initial value 3018655.129383
## iter 10 value 2993210.261010
## iter 20 value 2993143.782043
## final value 2993143.040717
## converged
## # weights: 67
## initial value 3021686.820563
## iter 10 value 2993222.754251
## iter 20 value 2993143.935431
## final value 2993143.056474
## converged
## # weights: 111
## initial value 3014204.721978
## iter 10 value 2993263.812354
## iter 20 value 2993144.423934
## final value 2993143.119489
## converged
## # weights: 23
## initial value 3097514.307528
## final value 3066593.000000
## converged
## # weights: 67
## initial value 3076147.206548
## final value 3066593.000000
## converged
## # weights: 111
## initial value 3080946.009980
## final value 3066593.000000
## converged
## # weights: 23
## initial value 3096714.698577
## iter 10 value 3066611.738406
## final value 3066600.000760
## converged
## # weights: 67
## initial value 3087639.604104
## iter 10 value 3066971.859218
## iter 20 value 3066598.488990
## final value 3066598.001794
## converged
## # weights: 111
## initial value 3092190.103300
## iter 10 value 3066607.205304
## iter 20 value 3066597.191252
## iter 30 value 3066595.816477
## final value 3066595.758265
## converged
## # weights: 23
## initial value 3088249.561681
## iter 10 value 3066655.438059
## iter 20 value 3066593.726959
## final value 3066593.056598
## converged
## # weights: 67
## initial value 3097893.735604
## iter 10 value 3066681.237707
## iter 20 value 3066594.034313
## final value 3066593.117766
## converged
## # weights: 111
## initial value 3100473.779459
## iter 10 value 3066721.655984
## iter 20 value 3066594.511691
## final value 3066593.088710
## converged
## # weights: 23
## initial value 3372222.164415
## final value 3336258.000000
## converged
## # weights: 67
## initial value 3375305.028493
## final value 3336258.000000
## converged
## # weights: 111
## initial value 3371822.975021
## final value 3336258.000000
## converged
## # weights: 23
## initial value 3368261.988962
## iter 10 value 3336267.319449
## final value 3336264.997942
## converged
## # weights: 67
## initial value 3365797.333298
## iter 10 value 3336368.843175
## final value 3336262.160113
## converged
## # weights: 111
## initial value 3371034.103749
## iter 10 value 3336435.333935
## iter 20 value 3336260.969075
## iter 20 value 3336260.959066
## final value 3336260.776710
## converged
## # weights: 23
## initial value 3369761.411609
## iter 10 value 3336327.403963
## iter 20 value 3336258.805793
## final value 3336258.060625
## converged
## # weights: 67
## initial value 3361082.229973
## final value 3336258.632746
## converged
## # weights: 111
## initial value 3368099.287314
## iter 10 value 3336403.004804
## iter 20 value 3336259.703144
## final value 3336258.144614
## converged
## # weights: 23
## initial value 3219235.755473
## final value 3188931.000000
## converged
## # weights: 67
## initial value 3220907.814144
## final value 3188931.000000
## converged
## # weights: 111
## initial value 3228172.890360
## final value 3188931.000000
## converged
## # weights: 23
## initial value 3217071.444658
## iter 10 value 3189200.055396
## iter 20 value 3188939.198368
## final value 3188938.032983
## converged
## # weights: 67
## initial value 3217820.261349
## iter 10 value 3188945.143215
## iter 20 value 3188935.781547
## final value 3188935.618098
## converged
## # weights: 111
## initial value 3224162.207852
## iter 10 value 3189301.972761
## iter 20 value 3188934.093573
## final value 3188933.746702
## converged
## # weights: 23
## initial value 3212464.915480
## iter 10 value 3188998.115057
## iter 20 value 3188931.779179
## final value 3188931.076347
## converged
## # weights: 67
## initial value 3217500.369924
## iter 10 value 3189009.567477
## iter 20 value 3188931.923061
## final value 3188931.078609
## converged
## # weights: 111
## initial value 3219675.518465
## iter 10 value 3189009.396443
## iter 20 value 3188931.935607
## final value 3188931.075546
## converged
## # weights: 23
## initial value 3402949.467121
## final value 3365946.000000
## converged
## # weights: 67
## initial value 3389943.934732
## final value 3365946.000000
## converged
## # weights: 111
## initial value 3395156.546999
## final value 3365946.000000
## converged
## # weights: 23
## initial value 3397784.260558
## iter 10 value 3365956.214150
## final value 3365952.998832
## converged
## # weights: 67
## initial value 3400824.997270
## iter 10 value 3366174.109824
## iter 20 value 3365950.418703
## final value 3365949.918008
## converged
## # weights: 111
## initial value 3389725.298422
## iter 10 value 3366287.308587
## iter 20 value 3365950.895317
## final value 3365948.873529
## converged
## # weights: 23
## initial value 3400030.904051
## iter 10 value 3365982.931188
## iter 20 value 3365946.433185
## final value 3365946.053153
## converged
## # weights: 67
## initial value 3387998.157928
## iter 10 value 3366016.792199
## iter 20 value 3365946.835039
## final value 3365946.095885
## converged
## # weights: 111
## initial value 3396084.482973
## final value 3365946.056456
## converged
## # weights: 23
## initial value 2875200.386612
## final value 2840876.000000
## converged
## # weights: 67
## initial value 2884539.824092
## final value 2840876.000000
## converged
## # weights: 111
## initial value 2859804.607379
## final value 2840876.000000
## converged
## # weights: 23
## initial value 2861785.385640
## iter 10 value 2840905.966764
## final value 2840882.955714
## converged
## # weights: 67
## initial value 2878768.970805
## iter 10 value 2840922.612440
## iter 20 value 2840881.288641
## final value 2840880.912545
## converged
## # weights: 111
## initial value 2879533.034639
## iter 10 value 2840908.153646
## iter 20 value 2840884.022118
## final value 2840881.587182
## converged
## # weights: 23
## initial value 2866924.386694
## iter 10 value 2840909.532004
## iter 20 value 2840876.393786
## final value 2840876.048736
## converged
## # weights: 67
## initial value 2875196.720113
## iter 10 value 2840936.071543
## iter 20 value 2840876.710843
## final value 2840876.086918
## converged
## # weights: 111
## initial value 2855944.562223
## iter 10 value 2840964.468249
## iter 20 value 2840877.050177
## final value 2840876.076135
## converged
## # weights: 23
## initial value 3326304.180783
## final value 3291403.000000
## converged
## # weights: 67
## initial value 3328806.258254
## final value 3291403.000000
## converged
## # weights: 111
## initial value 3327589.572318
## final value 3291403.000000
## converged
## # weights: 23
## initial value 3328804.551539
## iter 10 value 3291417.300037
## final value 3291410.013852
## converged
## # weights: 67
## initial value 3337928.733960
## iter 10 value 3291413.925808
## final value 3291410.016077
## converged
## # weights: 111
## initial value 3339787.876057
## iter 10 value 3291454.585761
## iter 20 value 3291417.730654
## iter 30 value 3291407.629054
## iter 40 value 3291405.791689
## iter 40 value 3291405.776020
## iter 40 value 3291405.764281
## final value 3291405.764281
## converged
## # weights: 23
## initial value 3326209.479733
## iter 10 value 3291469.390093
## iter 20 value 3291403.770720
## final value 3291403.057910
## converged
## # weights: 67
## initial value 3335453.410643
## iter 10 value 3291472.296860
## iter 20 value 3291403.812956
## final value 3291403.093157
## converged
## # weights: 111
## initial value 3330082.550379
## iter 10 value 3291600.609291
## iter 20 value 3291405.303193
## final value 3291403.117416
## converged
## # weights: 23
## initial value 3319660.932888
## final value 3282104.000000
## converged
## # weights: 67
## initial value 3317563.668671
## final value 3282104.000000
## converged
## # weights: 111
## initial value 3320364.230834
## final value 3282104.000000
## converged
## # weights: 23
## initial value 3318914.670622
## iter 10 value 3282621.697996
## iter 20 value 3282117.116367
## final value 3282110.979144
## converged
## # weights: 67
## initial value 3313132.255289
## iter 10 value 3282127.230086
## iter 20 value 3282110.381065
## final value 3282107.901824
## converged
## # weights: 111
## initial value 3321573.349911
## iter 10 value 3282168.769065
## iter 20 value 3282107.059728
## iter 20 value 3282107.043387
## final value 3282106.770804
## converged
## # weights: 23
## initial value 3312606.232289
## iter 10 value 3282139.538538
## iter 20 value 3282104.417149
## final value 3282104.051449
## converged
## # weights: 67
## initial value 3303108.547357
## iter 10 value 3282196.326271
## iter 20 value 3282105.079972
## final value 3282104.062438
## converged
## # weights: 111
## initial value 3299598.735759
## final value 3282104.056011
## converged
## # weights: 23
## initial value 3276110.364056
## final value 3241576.000000
## converged
## # weights: 67
## initial value 3276694.993988
## final value 3241576.000000
## converged
## # weights: 111
## initial value 3274196.355228
## final value 3241576.000000
## converged
## # weights: 23
## initial value 3274325.456662
## iter 10 value 3241588.473112
## iter 20 value 3241583.193449
## final value 3241583.014296
## converged
## # weights: 67
## initial value 3266259.470317
## iter 10 value 3241583.790451
## final value 3241579.899445
## converged
## # weights: 111
## initial value 3274214.942803
## iter 10 value 3241944.243640
## iter 20 value 3241579.543018
## final value 3241578.782761
## converged
## # weights: 23
## initial value 3279888.299989
## iter 10 value 3241630.758710
## iter 20 value 3241576.636109
## final value 3241576.048186
## converged
## # weights: 67
## initial value 3265125.117355
## iter 10 value 3241731.365952
## iter 20 value 3241577.810049
## final value 3241576.141604
## converged
## # weights: 111
## initial value 3268834.765556
## iter 10 value 3241628.688400
## iter 20 value 3241576.640716
## final value 3241576.075252
## converged
## # weights: 23
## initial value 2856308.164881
## final value 2828007.000000
## converged
## # weights: 67
## initial value 2859262.029164
## final value 2828007.000000
## converged
## # weights: 111
## initial value 2853868.476784
## final value 2828007.000000
## converged
## # weights: 23
## initial value 2847914.798213
## iter 10 value 2828082.331540
## iter 20 value 2828014.200742
## final value 2828013.972055
## converged
## # weights: 67
## initial value 2847302.023729
## iter 10 value 2828026.670250
## final value 2828010.866284
## converged
## # weights: 111
## initial value 2850368.026241
## iter 10 value 2828177.612752
## iter 20 value 2828015.911441
## iter 30 value 2828010.393183
## final value 2828010.025798
## converged
## # weights: 23
## initial value 2854592.985367
## iter 10 value 2828066.525005
## iter 20 value 2828007.693438
## final value 2828007.054356
## converged
## # weights: 67
## initial value 2851904.723387
## iter 10 value 2828087.133793
## iter 20 value 2828007.940377
## final value 2828007.075117
## converged
## # weights: 111
## initial value 2853096.441452
## iter 10 value 2828081.492447
## iter 20 value 2828007.885195
## final value 2828007.084623
## converged
## # weights: 23
## initial value 2966235.039508
## final value 2930948.000000
## converged
## # weights: 67
## initial value 2951275.393781
## final value 2930948.000000
## converged
## # weights: 111
## initial value 2948696.206051
## final value 2930948.000000
## converged
## # weights: 23
## initial value 2953783.516532
## iter 10 value 2931094.810595
## final value 2930954.952779
## converged
## # weights: 67
## initial value 2943792.385710
## iter 10 value 2931168.091348
## iter 20 value 2930952.031346
## final value 2930951.911070
## converged
## # weights: 111
## initial value 2966017.471498
## iter 10 value 2931114.066788
## iter 20 value 2930951.391641
## final value 2930950.755428
## converged
## # weights: 23
## initial value 2948245.571813
## iter 10 value 2930996.300416
## iter 20 value 2930948.561482
## final value 2930948.050875
## converged
## # weights: 67
## initial value 2961996.697728
## iter 10 value 2931072.738043
## iter 20 value 2930949.459193
## final value 2930948.061684
## converged
## # weights: 111
## initial value 2975826.740242
## iter 10 value 2931023.874138
## iter 20 value 2930948.899991
## final value 2930948.162525
## converged
## # weights: 23
## initial value 3200185.095834
## final value 3184330.000000
## converged
## # weights: 67
## initial value 3206181.245436
## final value 3184330.000000
## converged
## # weights: 111
## initial value 3223421.899216
## final value 3184330.000000
## converged
## # weights: 23
## initial value 3222441.509129
## iter 10 value 3184340.670116
## final value 3184337.002946
## converged
## # weights: 67
## initial value 3217821.127839
## iter 10 value 3184528.733266
## iter 20 value 3184334.114894
## final value 3184333.909782
## converged
## # weights: 111
## initial value 3208971.569779
## iter 10 value 3184555.875730
## iter 20 value 3184363.549698
## iter 30 value 3184333.850437
## final value 3184332.769218
## converged
## # weights: 23
## initial value 3209361.146383
## iter 10 value 3184379.125503
## iter 20 value 3184330.574214
## final value 3184330.046800
## converged
## # weights: 67
## initial value 3211828.645847
## iter 10 value 3184424.213141
## iter 20 value 3184331.102153
## final value 3184330.063824
## converged
## # weights: 111
## initial value 3200382.366016
## iter 10 value 3184443.891626
## iter 20 value 3184331.347777
## final value 3184330.071907
## converged
## # weights: 23
## initial value 3164955.329130
## final value 3134954.000000
## converged
## # weights: 67
## initial value 3156465.788861
## final value 3134954.000000
## converged
## # weights: 111
## initial value 3182534.573484
## final value 3134954.000000
## converged
## # weights: 23
## initial value 3177458.531104
## iter 10 value 3134984.144214
## iter 20 value 3134961.237630
## final value 3134960.980755
## converged
## # weights: 67
## initial value 3164528.679798
## iter 10 value 3134971.292874
## final value 3134957.953488
## converged
## # weights: 111
## initial value 3167597.610902
## iter 10 value 3134973.001492
## iter 20 value 3134957.418505
## iter 20 value 3134957.390890
## final value 3134957.226191
## converged
## # weights: 23
## initial value 3172176.403577
## iter 10 value 3135011.124067
## iter 20 value 3134954.662453
## final value 3134954.069033
## converged
## # weights: 67
## initial value 3168670.919047
## iter 10 value 3135032.731438
## iter 20 value 3134954.927634
## final value 3134954.059982
## converged
## # weights: 111
## initial value 3168581.913091
## iter 10 value 3135113.593496
## iter 20 value 3134955.868479
## final value 3134954.098624
## converged
## # weights: 23
## initial value 3134225.652481
## final value 3112880.000000
## converged
## # weights: 67
## initial value 3137796.040395
## final value 3112880.000000
## converged
## # weights: 111
## initial value 3151636.990600
## final value 3112880.000000
## converged
## # weights: 23
## initial value 3149310.943143
## iter 10 value 3112898.412613
## final value 3112887.007261
## converged
## # weights: 67
## initial value 3133585.284907
## iter 10 value 3112891.489925
## final value 3112885.138366
## converged
## # weights: 111
## initial value 3122000.537793
## iter 10 value 3112890.183519
## iter 20 value 3112884.055507
## iter 30 value 3112882.902151
## final value 3112882.764730
## converged
## # weights: 23
## initial value 3151633.457629
## iter 10 value 3112973.404137
## iter 20 value 3112881.082000
## final value 3112880.073337
## converged
## # weights: 67
## initial value 3140630.286775
## iter 10 value 3112936.099231
## iter 20 value 3112880.664908
## final value 3112880.062698
## converged
## # weights: 111
## initial value 3149900.122302
## iter 10 value 3112979.676869
## iter 20 value 3112881.178881
## final value 3112880.143596
## converged
## # weights: 23
## initial value 3268016.068991
## final value 3249187.000000
## converged
## # weights: 67
## initial value 3268144.034674
## final value 3249187.000000
## converged
## # weights: 111
## initial value 3283943.308673
## final value 3249187.000000
## converged
## # weights: 23
## initial value 3273292.881519
## iter 10 value 3249199.302988
## final value 3249194.016795
## converged
## # weights: 67
## initial value 3273661.113329
## iter 10 value 3249251.600098
## iter 20 value 3249195.624578
## iter 30 value 3249191.205841
## final value 3249190.923814
## converged
## # weights: 111
## initial value 3271561.800043
## iter 10 value 3249238.638861
## iter 20 value 3249201.243493
## iter 30 value 3249190.682363
## iter 40 value 3249189.751984
## iter 40 value 3249189.750619
## iter 40 value 3249189.750377
## final value 3249189.750377
## converged
## # weights: 23
## initial value 3274734.560823
## iter 10 value 3249256.661323
## iter 20 value 3249187.808362
## final value 3249187.060427
## converged
## # weights: 67
## initial value 3271660.240679
## iter 10 value 3249219.773227
## iter 20 value 3249187.396483
## final value 3249187.059334
## converged
## # weights: 111
## initial value 3272843.465883
## final value 3249187.043611
## converged
## # weights: 23
## initial value 3237494.578885
## final value 3211375.000000
## converged
## # weights: 67
## initial value 3233750.308575
## final value 3211375.000000
## converged
## # weights: 111
## initial value 3228582.810636
## final value 3211375.000000
## converged
## # weights: 23
## initial value 3250436.751642
## iter 10 value 3211427.819101
## final value 3211381.981478
## converged
## # weights: 67
## initial value 3232370.053395
## iter 10 value 3211389.624619
## iter 20 value 3211380.226152
## final value 3211378.900563
## converged
## # weights: 111
## initial value 3241113.516059
## iter 10 value 3211736.616027
## iter 20 value 3211378.091771
## final value 3211377.889768
## converged
## # weights: 23
## initial value 3241087.125639
## iter 10 value 3211436.322071
## iter 20 value 3211375.714329
## final value 3211375.055953
## converged
## # weights: 67
## initial value 3238068.774407
## iter 10 value 3211482.363457
## iter 20 value 3211376.253021
## final value 3211375.072160
## converged
## # weights: 111
## initial value 3233423.623787
## iter 10 value 3211440.789762
## iter 20 value 3211375.786664
## final value 3211375.080532
## converged
## # weights: 23
## initial value 3281544.508196
## final value 3259024.000000
## converged
## # weights: 67
## initial value 3285624.995426
## final value 3259024.000000
## converged
## # weights: 111
## initial value 3290295.338066
## final value 3259024.000000
## converged
## # weights: 23
## initial value 3292477.168145
## iter 10 value 3259759.018096
## iter 20 value 3259037.644888
## final value 3259036.491096
## converged
## # weights: 67
## initial value 3292437.571879
## iter 10 value 3259229.294154
## iter 20 value 3259027.942876
## iter 20 value 3259027.940576
## iter 20 value 3259027.928763
## final value 3259027.928763
## converged
## # weights: 111
## initial value 3274705.295826
## iter 10 value 3259395.135778
## iter 20 value 3259027.282145
## final value 3259026.796791
## converged
## # weights: 23
## initial value 3296859.733007
## iter 10 value 3259733.733578
## iter 20 value 3259032.182671
## iter 30 value 3259024.094340
## final value 3259024.038622
## converged
## # weights: 67
## initial value 3290490.606168
## iter 10 value 3259138.701730
## iter 20 value 3259025.339810
## final value 3259024.106921
## converged
## # weights: 111
## initial value 3295055.361910
## final value 3259024.145614
## converged
## # weights: 23
## initial value 2982956.316083
## final value 2945856.000000
## converged
## # weights: 67
## initial value 2963508.537360
## final value 2945856.000000
## converged
## # weights: 111
## initial value 2959916.723746
## final value 2945856.000000
## converged
## # weights: 23
## initial value 2966917.912006
## iter 10 value 2945964.643062
## iter 20 value 2945868.465562
## final value 2945863.010430
## converged
## # weights: 67
## initial value 2968132.172888
## iter 10 value 2945864.447163
## final value 2945861.029930
## converged
## # weights: 111
## initial value 2978581.916588
## iter 10 value 2946169.075382
## iter 20 value 2945859.504909
## final value 2945859.224339
## converged
## # weights: 23
## initial value 2961181.613540
## iter 10 value 2945907.405535
## iter 20 value 2945856.600011
## final value 2945856.048112
## converged
## # weights: 67
## initial value 2960933.284449
## iter 10 value 2945946.879058
## iter 20 value 2945857.063564
## final value 2945856.061984
## converged
## # weights: 111
## initial value 2983978.621513
## iter 10 value 2945995.821764
## iter 20 value 2945857.639569
## final value 2945856.129807
## converged
## # weights: 23
## initial value 3036469.812885
## final value 3007919.000000
## converged
## # weights: 67
## initial value 3038778.889324
## final value 3007919.000000
## converged
## # weights: 111
## initial value 3022428.073257
## final value 3007919.000000
## converged
## # weights: 23
## initial value 3024446.681053
## iter 10 value 3007926.098056
## final value 3007925.977955
## converged
## # weights: 67
## initial value 3036629.200722
## iter 10 value 3007934.608218
## iter 20 value 3007923.068333
## final value 3007922.872384
## converged
## # weights: 111
## initial value 3044947.988714
## iter 10 value 3008280.597137
## iter 20 value 3007924.990708
## final value 3007922.250018
## converged
## # weights: 23
## initial value 3042309.681169
## iter 10 value 3008059.885393
## iter 20 value 3007920.628631
## final value 3007919.070177
## converged
## # weights: 67
## initial value 3039082.750242
## iter 10 value 3008034.065790
## iter 20 value 3007920.343778
## final value 3007919.148520
## converged
## # weights: 111
## initial value 3022537.574001
## final value 3007919.049377
## converged
## # weights: 23
## initial value 3192074.739526
## final value 3154163.000000
## converged
## # weights: 67
## initial value 3190902.249499
## final value 3154163.000000
## converged
## # weights: 111
## initial value 3197053.490962
## final value 3154163.000000
## converged
## # weights: 23
## initial value 3179867.605178
## iter 10 value 3154396.912467
## final value 3154169.992770
## converged
## # weights: 67
## initial value 3175717.094462
## iter 10 value 3154189.096683
## iter 20 value 3154167.080181
## iter 20 value 3154167.059679
## iter 20 value 3154167.059679
## final value 3154167.059679
## converged
## # weights: 111
## initial value 3178217.418182
## iter 10 value 3154331.007034
## final value 3154173.183228
## converged
## # weights: 23
## initial value 3191526.313992
## iter 10 value 3154218.156781
## iter 20 value 3154163.640396
## final value 3154163.067419
## converged
## # weights: 67
## initial value 3180218.528679
## iter 10 value 3154234.263703
## iter 20 value 3154163.839725
## final value 3154163.104268
## converged
## # weights: 111
## initial value 3177797.481622
## iter 10 value 3154254.744368
## iter 20 value 3154164.085722
## final value 3154163.099708
## converged
## # weights: 23
## initial value 3204415.542651
## final value 3177835.000000
## converged
## # weights: 67
## initial value 3194059.096293
## final value 3177835.000000
## converged
## # weights: 111
## initial value 3196395.227032
## final value 3177835.000000
## converged
## # weights: 23
## initial value 3213441.361630
## iter 10 value 3177854.501907
## final value 3177841.978359
## converged
## # weights: 67
## initial value 3201567.874644
## iter 10 value 3178310.281592
## iter 20 value 3177839.950674
## final value 3177838.967177
## converged
## # weights: 111
## initial value 3201568.060069
## iter 10 value 3177846.415203
## iter 20 value 3177842.981113
## iter 30 value 3177838.310169
## final value 3177837.781601
## converged
## # weights: 23
## initial value 3202101.487795
## iter 10 value 3177868.999589
## iter 20 value 3177835.396234
## final value 3177835.046348
## converged
## # weights: 67
## initial value 3208565.273266
## iter 10 value 3177939.217128
## iter 20 value 3177836.215643
## final value 3177835.090279
## converged
## # weights: 111
## initial value 3205590.078696
## iter 10 value 3177970.012346
## iter 20 value 3177836.585779
## final value 3177835.073211
## converged
## # weights: 23
## initial value 3271921.107095
## final value 3237186.000000
## converged
## # weights: 67
## initial value 3258223.970317
## final value 3237186.000000
## converged
## # weights: 111
## initial value 3256361.494328
## final value 3237186.000000
## converged
## # weights: 23
## initial value 3275658.365772
## iter 10 value 3237325.664099
## iter 20 value 3237193.338395
## final value 3237193.140514
## converged
## # weights: 67
## initial value 3271542.985178
## iter 10 value 3237417.102501
## iter 20 value 3237190.742023
## iter 20 value 3237190.710725
## final value 3237189.937074
## converged
## # weights: 111
## initial value 3269548.826162
## iter 10 value 3237193.700319
## iter 20 value 3237189.362138
## final value 3237188.774944
## converged
## # weights: 23
## initial value 3269597.909854
## iter 10 value 3237225.019547
## iter 20 value 3237186.455448
## final value 3237186.053908
## converged
## # weights: 67
## initial value 3248593.093609
## iter 10 value 3237227.028428
## iter 20 value 3237186.488186
## final value 3237186.066047
## converged
## # weights: 111
## initial value 3255889.199338
## final value 3237186.051690
## converged
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## # weights: 23
## initial value 3539189.970922
## final value 3498931.000000
## converged
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold01.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold02.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold05.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold08.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep1: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep1: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold02.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold05.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold06.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold07.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold10.Rep2: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold10.Rep2: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold03.Rep3: layer1=3, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold04.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold08.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
## Warning in is.na(weights): is.na() applied to non-(list or vector) of type
## 'NULL'
## Warning: predictions failed for Fold09.Rep3: layer1=5, layer2=0, layer3=0 Error in nrow[w] * ncol[w] : non-numeric argument to binary operator
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Warning: algorithm did not converge in 1 of 1 repetition(s) within the
## stepmax
# Label the cross-validation results table in one vectorized assignment
# (instead of three scalar ones): model identifier, mean absolute error,
# and root-mean-squared error.
colnames(performetrics)[1:3] <- c("Method", "MAE", "RMSE")

# Inspect the finished metrics table interactively (RStudio data viewer).
View(performetrics)

# Clean up the loop index, trainControl object, method list, fitted model,
# and the (already viewed) metrics table from the workspace.
rm(i, control, methods, model_sj.cv, performetrics)
# Build a weekly time series of San Juan case counts. The series starts at
# (earliest year, earliest week observed within that year); frequency = 52
# marks one seasonal cycle per year.
ts_sj <- ts(
  sj_train_labels.lastna$total_cases,
  start = with(
    sj_train_labels.lastna,
    c(min(year), min(weekofyear[year == min(year)]))
  ),
  frequency = 52
)

# Raw series and its classical additive decomposition (trend/seasonal/random).
plot(ts_sj, main = 'SJ: Total_cases')
plot(decompose(ts_sj))

# fit1: full Holt-Winters (level + trend + seasonal).
# fit2: simple exponential smoothing (trend and seasonal components disabled).
fit1 <- HoltWinters(ts_sj)
fit2 <- HoltWinters(ts_sj, beta = FALSE, gamma = FALSE)

# Stack both fitted-vs-observed plots in a single column for comparison.
par(mfrow = c(2, 1))
plot(fit1)
plot(fit2)